 
 """
 
+import einops
 import torch
 from ocf_datapipes.batch import BatchKey
 from torch import nn
@@ -128,6 +129,8 @@ def __init__(
         target_id_dim: int = 318,
         target_key_to_use: str = "gsp",
         input_key_to_use: str = "pv",
+        num_channels: int = 1,
+        num_sites_in_inference: int = 1,
     ):
         """A simple attention-based model with a single multihead attention layer
 
@@ -148,6 +151,13 @@ def __init__(
             target_id_dim: The number of unique IDs.
             target_key_to_use: The key to use for the target in the attention layer.
             input_key_to_use: The key to use for the input in the attention layer.
+            num_channels: Number of channels in the input data. For single-site
+                generation this will be 1, as there is no channel dimension; for
+                sensor data it will generally be higher.
+            num_sites_in_inference: Number of sites to use in inference. This
+                determines how many sites are passed through the attention layer:
+                1 for a single site, and more for multiple sites (such as multiple
+                sensors).
 
         """
         super().__init__(sequence_length, num_sites, out_features)
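The two new arguments feed directly into the encoder sizes changed in the hunks below. As a quick illustration of the sizing logic only (the numeric values here are hypothetical, not taken from the source), the flattened per-site input width grows with `num_channels`:

```python
# Hypothetical example values, purely to illustrate the in_features sizing below.
sequence_length = 26
id_embed_dim = 10

# Single-site generation: no channel dimension, so num_channels = 1.
print(sequence_length * 1 + id_embed_dim)   # 36 input features per site

# Multi-channel sensor data, e.g. 23 channels per station.
print(sequence_length * 23 + id_embed_dim)  # 608 input features per site
```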
@@ -158,15 +168,18 @@ def __init__(
         self.use_id_in_value = use_id_in_value
         self.target_key_to_use = target_key_to_use
         self.input_key_to_use = input_key_to_use
+        self.num_channels = num_channels
+        self.num_sites_in_inference = num_sites_in_inference
 
         if use_id_in_value:
             self.value_id_embedding = nn.Embedding(num_sites, id_embed_dim)
 
         self._value_encoder = nn.Sequential(
             ResFCNet2(
-                in_features=sequence_length + int(use_id_in_value) * id_embed_dim,
+                in_features=sequence_length * self.num_channels
+                + int(use_id_in_value) * id_embed_dim,
                 out_features=out_features,
-                fc_hidden_features=sequence_length,
+                fc_hidden_features=sequence_length * self.num_channels,
                 n_res_blocks=n_kv_res_blocks,
                 res_block_layers=kv_res_block_layers,
                 dropout_frac=0,
@@ -175,9 +188,9 @@ def __init__(
 
         self._key_encoder = nn.Sequential(
             ResFCNet2(
-                in_features=sequence_length + id_embed_dim,
+                in_features=id_embed_dim + sequence_length * self.num_channels,
                 out_features=kdim,
-                fc_hidden_features=id_embed_dim + sequence_length,
+                fc_hidden_features=id_embed_dim + sequence_length * self.num_channels,
                 n_res_blocks=n_kv_res_blocks,
                 res_block_layers=kv_res_block_layers,
                 dropout_frac=0,
@@ -192,6 +205,20 @@ def __init__(
             batch_first=True,
         )
 
+    def _encode_inputs(self, x):
+        # Shape: [batch size, sequence length, PV site] -> [8, 197, 1]
+        # Shape: [batch size, station_id, sequence length, channels] -> [8, 197, 26, 23]
+        input_data = x[BatchKey[f"{self.input_key_to_use}"]]
+        if len(input_data.shape) == 4:  # Has multiple channels
+            input_data = input_data[:, :, : self.sequence_length]
+            input_data = einops.rearrange(input_data, "b id s c -> b (s c) id")
+        else:
+            input_data = input_data[:, : self.sequence_length]
+        site_seqs = input_data.float()
+        batch_size = site_seqs.shape[0]
+        site_seqs = site_seqs.swapaxes(1, 2)  # [batch size, site ID, sequence length]
+        return site_seqs, batch_size
+
     def _encode_query(self, x):
         # Select the first one
         if self.target_key_to_use == "gsp":
@@ -206,34 +233,29 @@ def _encode_query(self, x):
         return query
 
     def _encode_key(self, x):
-        # Shape: [batch size, sequence length, PV site]
-        site_seqs = x[BatchKey[f"{self.input_key_to_use}"]][:, : self.sequence_length].float()
-        batch_size = site_seqs.shape[0]
+        site_seqs, batch_size = self._encode_inputs(x)
 
         # wind ID embeddings are the same for each sample
         site_id_embed = torch.tile(self.site_id_embedding(self._ids), (batch_size, 1, 1))
         # Each concatenated (wind sequence, wind ID embedding) is processed with encoder
-        x_seq_in = torch.cat((site_seqs.swapaxes(1, 2), site_id_embed), dim=2).flatten(0, 1)
+        x_seq_in = torch.cat((site_seqs, site_id_embed), dim=2).flatten(0, 1)
         key = self._key_encoder(x_seq_in)
 
         # Reshape to [batch size, PV site, kdim]
         key = key.unflatten(0, (batch_size, self.num_sites))
         return key
 
     def _encode_value(self, x):
-        # Shape: [batch size, sequence length, PV site]
-        site_seqs = x[BatchKey[f"{self.input_key_to_use}"]][:, : self.sequence_length].float()
-        batch_size = site_seqs.shape[0]
+        site_seqs, batch_size = self._encode_inputs(x)
 
         if self.use_id_in_value:
             # wind ID embeddings are the same for each sample
             site_id_embed = torch.tile(self.value_id_embedding(self._ids), (batch_size, 1, 1))
             # Each concatenated (wind sequence, wind ID embedding) is processed with encoder
-            x_seq_in = torch.cat((site_seqs.swapaxes(1, 2), site_id_embed), dim=2).flatten(0, 1)
+            x_seq_in = torch.cat((site_seqs, site_id_embed), dim=2).flatten(0, 1)
         else:
             # Encode each PV sequence independently
-            x_seq_in = site_seqs.swapaxes(1, 2).flatten(0, 1)
-
+            x_seq_in = site_seqs.flatten(0, 1)
         value = self._value_encoder(x_seq_in)
 
         # Reshape to [batch size, PV site, vdim]
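For reference, a standalone sketch of the reshaping the new `_encode_inputs` method performs on multi-channel sensor data. The shapes are illustrative (not taken from the source), and it assumes `torch` and `einops` are installed:

```python
import einops
import torch

batch_size, num_sites, timesteps, num_channels = 8, 197, 30, 23
sequence_length = 26  # the model keeps only the first `sequence_length` steps

# Multi-channel sensor input: [batch size, station_id, sequence length, channels]
sensor_data = torch.randn(batch_size, num_sites, timesteps, num_channels)

# Crop to the model's sequence length, then fold (sequence, channel) into one axis per site.
cropped = sensor_data[:, :, :sequence_length]
flattened = einops.rearrange(cropped, "b id s c -> b (s c) id")

# Same swap as in `_encode_inputs`: [batch size, site ID, sequence length * channels]
site_seqs = flattened.float().swapaxes(1, 2)
assert site_seqs.shape == (batch_size, num_sites, sequence_length * num_channels)
```

Each site's flattened `sequence_length * num_channels` vector is then what `_encode_key` and `_encode_value` concatenate with the site ID embedding before encoding.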