Skip to content

Commit 075ae31

Browse files
authored
Add sample saving for Site Dataset + presaved SiteDataset/DataModule for loading presaved samples (#290)
1 parent 430e1d7 commit 075ae31

File tree

17 files changed

+230
-329
lines changed

17 files changed

+230
-329
lines changed

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -145,20 +145,20 @@ This is also where you can update the train, val & test periods to cover the dat
145145

146146
### Running the batch creation script
147147

148-
Run the `save_batches.py` script to create batches with the parameters specified in the datamodule config (`streamed_batches.yaml` in this example):
148+
Run the `save_samples.py` script to create samples with the parameters specified in the datamodule config (`streamed_batches.yaml` in this example):
149149

150150
```bash
151-
python scripts/save_batches.py
151+
python scripts/save_samples.py
152152
```
153153
PVNet uses
154154
[hydra](https://hydra.cc/) which enables us to pass variables via the command
155155
line that will override the configuration defined in the `./configs` directory, like this:
156156

157157
```bash
158-
python scripts/save_batches.py datamodule=streamed_batches datamodule.batch_output_dir="./output" datamodule.num_train_batches=10 datamodule.num_val_batches=5
158+
python scripts/save_samples.py datamodule=streamed_batches datamodule.sample_output_dir="./output" datamodule.num_train_batches=10 datamodule.num_val_batches=5
159159
```
160160

161-
`scripts/save_batches.py` needs a config under `PVNet/configs/datamodule`. You can adapt `streamed_batches.yaml` or create your own in the same folder.
161+
`scripts/save_samples.py` needs a config under `PVNet/configs/datamodule`. You can adapt `streamed_batches.yaml` or create your own in the same folder.
162162

163163
If downloading private data from a GCP bucket make sure to authenticate gcloud (the public satellite data does not need authentication):
164164

@@ -197,7 +197,7 @@ Make sure to update the following config files before training your model:
197197
2. In `configs/model/local_multimodal.yaml`:
198198
- update the list of encoders to reflect the data sources you are using. If you are using different NWP sources, the encoders for these should follow the same structure with two important updates:
199199
- `in_channels`: number of variables your NWP source supplies
200-
- `image_size_pixels`: spatial crop of your NWP data. It depends on the spatial resolution of your NWP; should match `nwp_image_size_pixels_height` and/or `nwp_image_size_pixels_width` in `datamodule/example_configs.yaml`, unless transformations such as coarsening was applied (e. g. as for ECMWF data)
200+
- `image_size_pixels`: spatial crop of your NWP data. It depends on the spatial resolution of your NWP; should match `image_size_pixels_height` and/or `image_size_pixels_width` in `datamodule/configuration/site_example_configuration.yaml` for the NWP, unless transformations such as coarsening were applied (e.g. as for ECMWF data)
201201
3. In `configs/local_trainer.yaml`:
202202
- set `accelerator: 0` if running on a system without a supported GPU
203203

@@ -216,7 +216,7 @@ defaults:
216216
- hydra: default.yaml
217217
```
218218

219-
Assuming you ran the `save_batches.py` script to generate some premade train and
219+
Assuming you ran the `save_samples.py` script to generate some premade train and
220220
val data batches, you can now train PVNet by running:
221221

222222
```

pvnet/data/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
"""Data parts"""
2+
from .site_datamodule import SiteDataModule
3+
from .uk_regional_datamodule import DataModule
24
from .utils import BatchSplitter

pvnet/data/base.py

Lines changed: 0 additions & 116 deletions
This file was deleted.

pvnet/data/base_datamodule.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
""" Data module for pytorch lightning """
2+
from lightning.pytorch import LightningDataModule
3+
from ocf_data_sampler.numpy_sample.collate import stack_np_samples_into_batch
4+
from ocf_datapipes.batch import (
5+
NumpyBatch,
6+
TensorBatch,
7+
batch_to_tensor,
8+
)
9+
from torch.utils.data import DataLoader, Dataset
10+
11+
12+
def collate_fn(samples: list[NumpyBatch]) -> TensorBatch:
    """Stack a list of NumpySample samples into one batch and convert it to tensors."""
    stacked_batch = stack_np_samples_into_batch(samples)
    return batch_to_tensor(stacked_batch)
15+
16+
17+
class BaseDataModule(LightningDataModule):
    """Base Datamodule for training pvnet and using pvnet pipeline in ocf-data-sampler."""

    def __init__(
        self,
        configuration: str | None = None,
        sample_dir: str | None = None,
        batch_size: int = 16,
        num_workers: int = 0,
        prefetch_factor: int | None = None,
        train_period: list[str | None] | None = None,
        val_period: list[str | None] | None = None,
    ):
        """Base Datamodule for training pvnet architecture.

        Can also be used with pre-made batches if `sample_dir` is set.

        Args:
            configuration: Path to ocf-data-sampler configuration file.
            sample_dir: Path to the directory of pre-saved samples. Cannot be used together with
                `configuration` or '[train/val]_period'.
            batch_size: Batch size.
            num_workers: Number of workers to use in multiprocess batch loading.
            prefetch_factor: Number of batches loaded in advance by each worker process.
            train_period: Date range filter for train dataloader, as a two-element
                [start, end] list. Defaults to [None, None] (no filtering).
            val_period: Date range filter for val dataloader, as a two-element
                [start, end] list. Defaults to [None, None] (no filtering).

        Raises:
            ValueError: If neither or both of `sample_dir` and `configuration` are set, or if
                a period filter is combined with `sample_dir`.
        """
        super().__init__()

        # `None` defaults (rather than `[None, None]`) avoid the shared mutable-default-argument
        # pitfall; normalise to fresh lists here so behaviour matches the old defaults exactly.
        train_period = [None, None] if train_period is None else list(train_period)
        val_period = [None, None] if val_period is None else list(val_period)

        # Exactly one data source must be chosen: streamed (`configuration`)
        # or presaved (`sample_dir`).
        if not ((sample_dir is not None) ^ (configuration is not None)):
            raise ValueError("Exactly one of `sample_dir` or `configuration` must be set.")

        if sample_dir is not None:
            # Presaved samples are loaded as-is, so a period filter would silently do nothing.
            if any(period != [None, None] for period in [train_period, val_period]):
                raise ValueError("Cannot set `(train/val)_period` with presaved samples")

        self.configuration = configuration
        self.sample_dir = sample_dir
        self.train_period = train_period
        self.val_period = val_period

        # Shared DataLoader settings; `shuffle` is chosen per-split in the dataloader methods.
        self._common_dataloader_kwargs = dict(
            batch_size=batch_size,
            sampler=None,
            batch_sampler=None,
            num_workers=num_workers,
            collate_fn=collate_fn,
            pin_memory=False,
            drop_last=False,
            timeout=0,
            worker_init_fn=None,
            prefetch_factor=prefetch_factor,
            persistent_workers=False,
        )

    def _get_streamed_samples_dataset(self, start_time, end_time) -> Dataset:
        """Return a dataset streaming samples for the given time range. Subclass hook."""
        raise NotImplementedError

    def _get_premade_samples_dataset(self, subdir) -> Dataset:
        """Return a dataset of presaved samples under the given subdirectory. Subclass hook."""
        raise NotImplementedError

    def train_dataloader(self) -> DataLoader:
        """Construct train dataloader"""
        if self.sample_dir is not None:
            dataset = self._get_premade_samples_dataset("train")
        else:
            dataset = self._get_streamed_samples_dataset(*self.train_period)
        return DataLoader(dataset, shuffle=True, **self._common_dataloader_kwargs)

    def val_dataloader(self) -> DataLoader:
        """Construct val dataloader"""
        if self.sample_dir is not None:
            dataset = self._get_premade_samples_dataset("val")
        else:
            dataset = self._get_streamed_samples_dataset(*self.val_period)
        return DataLoader(dataset, shuffle=False, **self._common_dataloader_kwargs)

pvnet/data/pv_site_datamodule.py

Lines changed: 0 additions & 67 deletions
This file was deleted.

0 commit comments

Comments
 (0)