openclimatefix
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎MANIFEST.in
Lines changed: 1 addition & 0 deletions b/‎MANIFEST.in
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md
Lines changed: 6 additions & 6 deletions b/‎README.md
Lines changed: 6 additions & 6 deletions
diff --git a/‎configs.example/datamodule/configuration/example_configuration.yaml
Lines changed: 80 additions & 120 deletions b/‎configs.example/datamodule/configuration/example_configuration.yaml
Lines changed: 80 additions & 120 deletions
diff --git a/‎configs.example/datamodule/premade_batches.yaml
Lines changed: 5 additions & 3 deletions b/‎configs.example/datamodule/premade_batches.yaml
Lines changed: 5 additions & 3 deletions
diff --git a/‎configs.example/datamodule/streamed_batches.yaml
Lines changed: 4 additions & 6 deletions b/‎configs.example/datamodule/streamed_batches.yaml
Lines changed: 4 additions & 6 deletions
diff --git a/‎pvnet/data/__init__.py
Lines changed: 2 additions & 1 deletion b/‎pvnet/data/__init__.py
Lines changed: 2 additions & 1 deletion
@@ -140,3 +140,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+.DS_Store
@@ -1 +1,2 @@
 include *.txt
+recursive-include pvnet/models/model_cards *.md
@@ -148,20 +148,20 @@ This is also where you can update the train, val & test periods to cover the dat
 
 ### Running the batch creation script
 
-Run the `save_batches.py` script to create batches with the parameters specified in the datamodule config (`streamed_batches.yaml` in this example):
+Run the `save_samples.py` script to create samples with the parameters specified in the datamodule config (`streamed_batches.yaml` in this example):
 
 ```bash
-python scripts/save_batches.py
+python scripts/save_samples.py
 ```
 PVNet uses
 [hydra](https://hydra.cc/) which enables us to pass variables via the command
 line that will override the configuration defined in the `./configs` directory, like this:
 
 ```bash
-python scripts/save_batches.py datamodule=streamed_batches datamodule.batch_output_dir="./output" datamodule.num_train_batches=10 datamodule.num_val_batches=5
+python scripts/save_samples.py datamodule=streamed_batches datamodule.sample_output_dir="./output" datamodule.num_train_samples=10 datamodule.num_val_samples=5
 ```
 
-`scripts/save_batches.py` needs a config under `PVNet/configs/datamodule`. You can adapt `streamed_batches.yaml` or create your own in the same folder.
+`scripts/save_samples.py` needs a config under `PVNet/configs/datamodule`. You can adapt `streamed_batches.yaml` or create your own in the same folder.
 
 If downloading private data from a GCP bucket make sure to authenticate gcloud (the public satellite data does not need authentication):
 
@@ -200,7 +200,7 @@ Make sure to update the following config files before training your model:
 2. In `configs/model/local_multimodal.yaml`:
     - update the list of encoders to reflect the data sources you are using. If you are using different NWP sources, the encoders for these should follow the same structure with two important updates:
         - `in_channels`: number of variables your NWP source supplies
-        - `image_size_pixels`: spatial crop of your NWP data. It depends on the spatial resolution of your NWP; should match `nwp_image_size_pixels_height` and/or `nwp_image_size_pixels_width` in `datamodule/example_configs.yaml`, unless transformations such as coarsening was applied (e. g. as for ECMWF data)
+        - `image_size_pixels`: spatial crop of your NWP data. It depends on the spatial resolution of your NWP; should match `image_size_pixels_height` and/or `image_size_pixels_width` in `datamodule/configuration/site_example_configuration.yaml` for the NWP, unless transformations such as coarsening were applied (e.g. as for ECMWF data)
 3. In `configs/local_trainer.yaml`:
     - set `accelerator: 0` if running on a system without a supported GPU
 
@@ -219,7 +219,7 @@ defaults:
   - hydra: default.yaml
 ```
 
-Assuming you ran the `save_batches.py` script to generate some premade train and
+Assuming you ran the `save_samples.py` script to generate some premade train and
 val data batches, you can now train PVNet by running:
 
 ```
 
@@ -1,124 +1,48 @@
 general:
-  description: Example data config for creating PVNet batches
-  name: example_pvnet
+  description: Example config for producing PVNet samples
+  name: example_config
 
 input_data:
-  default_history_minutes: 120
-  default_forecast_minutes: 480
+
+  # Either use Site OR GSP configuration
+  site:
+    # Path to Site data in NetCDF format
+    file_path: PLACEHOLDER.nc
+    # Path to metadata in CSV format
+    metadata_file_path: PLACEHOLDER.csv
+    time_resolution_minutes: 15
+    interval_start_minutes: -60
+    # Specified for intraday currently
+    interval_end_minutes: 480
+    dropout_timedeltas_minutes: null
+    dropout_fraction: 0 # Fraction of samples with dropout
 
   gsp:
-    # Path to the GSP data. This should be a zarr file
+    # Path to GSP data in zarr format
     # e.g. gs://solar-pv-nowcasting-data/PV/GSP/v7/pv_gsp.zarr
-    gsp_zarr_path: PLACEHOLDER.zarr
-    history_minutes: 120
-    forecast_minutes: 480
+    zarr_path: PLACEHOLDER.zarr
+    interval_start_minutes: -60
+    # Specified for intraday currently
+    interval_end_minutes: 480
     time_resolution_minutes: 30
-    # A random value from the list below will be chosen as the delay when dropout is used
+    # Random value from the list below will be chosen as the delay when dropout is used
     # If set to null no dropout is applied. Only values before t0 are dropped out for GSP.
     # Values after t0 are assumed as targets and cannot be dropped.
     dropout_timedeltas_minutes: null
     dropout_fraction: 0 # Fraction of samples with dropout
 
-  pv:
-    pv_files_groups:
-      - label: solar_sheffield_passiv
-        # Path to the site-level PV data. This should be a netcdf
-        # e.g gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
-        pv_filename: PLACEHOLDER.netcdf
-        # Path to the site-level PV metadata. This choudl be a csv
-        # e.g gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv
-        pv_metadata_filename: PLACEHOLDER.csv
-    # This is the list of pv_ml_ids to be sliced from the PV site level data
-    # The IDs below are 349 of the PV systems which have very little NaN data in the historic data
-    # and which are still reporting live (as of Oct 2023)
-    pv_ml_ids:
-      [
-        154, 155, 156, 158, 159, 160, 162, 164, 165, 166, 167, 168, 169, 171, 173, 177, 178, 179,
-        181, 182, 185, 186, 187, 188, 189, 190, 191, 192, 193, 197, 198, 199, 200, 202, 204, 205,
-        206, 208, 209, 211, 214, 215, 216, 217, 218, 219, 220, 221, 225, 229, 230, 232, 233, 234,
-        236, 242, 243, 245, 252, 254, 255, 256, 257, 258, 260, 261, 262, 265, 267, 268, 272, 273,
-        275, 276, 277, 280, 281, 282, 283, 287, 289, 291, 292, 293, 294, 295, 296, 297, 298, 301,
-        302, 303, 304, 306, 307, 309, 310, 311, 317, 318, 319, 320, 321, 322, 323, 325, 326, 329,
-        332, 333, 335, 336, 338, 340, 342, 344, 345, 346, 348, 349, 352, 354, 355, 356, 357, 360,
-        362, 363, 368, 369, 370, 371, 372, 374, 375, 376, 378, 380, 382, 384, 385, 388, 390, 391,
-        393, 396, 397, 398, 399, 400, 401, 403, 404, 405, 406, 407, 409, 411, 412, 413, 414, 415,
-        416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 429, 431, 435, 437, 438, 440,
-        441, 444, 447, 450, 451, 453, 456, 457, 458, 459, 464, 465, 466, 467, 468, 470, 471, 473,
-        474, 476, 477, 479, 480, 481, 482, 485, 486, 488, 490, 491, 492, 493, 496, 498, 501, 503,
-        506, 507, 508, 509, 510, 511, 512, 513, 515, 516, 517, 519, 520, 521, 522, 524, 526, 527,
-        528, 531, 532, 536, 537, 538, 540, 541, 542, 543, 544, 545, 549, 550, 551, 552, 553, 554,
-        556, 557, 560, 561, 563, 566, 568, 571, 572, 575, 576, 577, 579, 580, 581, 582, 584, 585,
-        588, 590, 594, 595, 597, 600, 602, 603, 604, 606, 611, 613, 614, 616, 618, 620, 622, 623,
-        624, 625, 626, 628, 629, 630, 631, 636, 637, 638, 640, 641, 642, 644, 645, 646, 650, 651,
-        652, 653, 654, 655, 657, 660, 661, 662, 663, 666, 667, 668, 670, 675, 676, 679, 681, 683,
-        684, 685, 687, 696, 698, 701, 702, 703, 704, 706, 710, 722, 723, 724, 725, 727, 728, 729,
-        730, 732, 733, 734, 735, 736, 737
-      ]
-    history_minutes: 180
-    forecast_minutes: 0
-    time_resolution_minutes: 5
-    # A random value from the list below will be chosen as the delay when dropout is used.
-    # If set to null no dropout is applied. All PV systems are dropped together with this setting.
-    dropout_timedeltas_minutes: null
-    dropout_fraction: 0 # Fraction of samples with dropout
-    # A random value from the list below will be chosen as the delay when system dropout is used.
-    # If set to null no dropout is applied. All PV systems are indpendently with this setting.
-    system_dropout_timedeltas_minutes: null
-    # For ech sample a differnt dropout probability is used which is uniformly sampled from the min
-    # and max below
-    system_dropout_fraction_min: 0
-    system_dropout_fraction_max: 0
-
   nwp:
-    ukv:
-      nwp_provider: ukv
-      nwp_zarr_path:
-        # Path(s) to UKV NWP data in zarr format
-        # e.g. gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV_intermediate_version_7.zarr
-        - PLACEHOLDER.zarr
-      history_minutes: 120
-      forecast_minutes: 480
-      time_resolution_minutes: 60
-      nwp_channels:
-        # These variables exist in the CEDA training set and in the live MetOffice live service
-        - t # 2-metre temperature
-        - dswrf # downwards short-wave radiation flux
-        - dlwrf # downwards long-wave radiation flux
-        - hcc # high cloud cover
-        - mcc # medium cloud cover
-        - lcc # low cloud cover
-        - sde # snow depth water equivalent
-        - r # relative humidty
-        - vis # visibility
-        - si10 # 10-metre wind speed
-        - wdir10 # 10-metre wind direction
-        - prate # precipitation rate
-        # These variables exist in CEDA training data but not in the live MetOffice live service
-        - hcct # height of convective cloud top, meters above surface. NaN if no clouds
-        - cdcb # height of lowest cloud base > 3 oktas
-        - dpt # dew point temperature
-        - prmsl # mean sea level pressure
-        - h # geometrical? (maybe geopotential?) height
-      nwp_image_size_pixels_height: 24
-      nwp_image_size_pixels_width: 24
-      # A random value from the list below will be chosen as the delay when dropout is used
-      # If set to null no dropout is applied. Values must be negative.
-      dropout_timedeltas_minutes: [-180]
-      # Dropout applied with this probability
-      dropout_fraction: 1.0
-      # How long after the NWP init-time are we still willing to use this forecast
-      # If null we use each init-time for all steps it covers
-      max_staleness_minutes: null
 
     ecmwf:
-      nwp_provider: ecmwf
+      provider: ecmwf
       # Path to ECMWF NWP data in zarr format
       # n.b. It is not necessary to use multiple or any NWP data. These entries can be removed
-      nwp_zarr_path: PLACEHOLDER.zarr
-      history_minutes: 120
-      forecast_minutes: 480
+      zarr_path: PLACEHOLDER.zarr
+      interval_start_minutes: -60
+      # Specified for intraday currently
+      interval_end_minutes: 480
       time_resolution_minutes: 60
-      nwp_channels:
+      channels:
         - t2m # 2-metre temperature
         - dswrf # downwards short-wave radiation flux
         - dlwrf # downwards long-wave radiation flux
@@ -136,23 +60,61 @@ input_data:
         - v10 # 10-metre V component of wind speed
         - v100 # 100-metre V component of wind speed
         - v200 # 200-metre V component of wind speed
-      nwp_image_size_pixels_height: 12 # roughly equivalent to UKV 24 pixels
-      nwp_image_size_pixels_width: 12
-      dropout_timedeltas_minutes: [-180]
-      dropout_fraction: 1.0
+      # The following channels are accumulated and need to be diffed
+      accum_channels:
+        - dswrf # downwards short-wave radiation flux
+        - dlwrf # downwards long-wave radiation flux
+        - sr # direct solar radiation
+        - duvrs # downwards UV radiation at surface
+      image_size_pixels_height: 24
+      image_size_pixels_width: 24
+      dropout_timedeltas_minutes: [-360]
+      dropout_fraction: 1.0 # Fraction of samples with dropout
+      max_staleness_minutes: null
+
+    ukv:
+      provider: ukv
+      # Path to UKV NWP data in zarr format
+      # e.g. gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV_intermediate_version_7.zarr
+      # n.b. It is not necessary to use multiple or any NWP data. These entries can be removed
+      zarr_path: PLACEHOLDER.zarr
+      interval_start_minutes: -60
+      # Specified for intraday currently
+      interval_end_minutes: 480
+      time_resolution_minutes: 60
+      channels:
+        - t # 2-metre temperature
+        - dswrf # downwards short-wave radiation flux
+        - dlwrf # downwards long-wave radiation flux
+        - hcc # high cloud cover
+        - mcc # medium cloud cover
+        - lcc # low cloud cover
+        - sde # snow depth water equivalent
+        - r # relative humidity
+        - vis # visibility
+        - si10 # 10-metre wind speed
+        - wdir10 # 10-metre wind direction
+        - prate # precipitation rate
+        # These variables exist in CEDA training data but not in the live MetOffice service
+        - hcct # height of convective cloud top, meters above surface. NaN if no clouds
+        - cdcb # height of lowest cloud base > 3 oktas
+        - dpt # dew point temperature
+        - prmsl # mean sea level pressure
+        - h # geometrical? (maybe geopotential?) height
+      image_size_pixels_height: 24
+      image_size_pixels_width: 24
+      dropout_timedeltas_minutes: [-360]
+      dropout_fraction: 1.0 # Fraction of samples with dropout
       max_staleness_minutes: null
 
   satellite:
-    satellite_zarr_path:
-      # Path(s) to non-HRV satellite data in zarr format
-      # e.g. gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr
-      - PLACEHOLDER.zarr
-    history_minutes: 90
-    forecast_minutes: 0 # Deprecated for most use cases
-    live_delay_minutes: 60 # Only data up to time t0-60minutes is inluced in slice
+    # Path to Satellite data (non-HRV) in zarr format
+    # e.g. gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr
+    zarr_path: PLACEHOLDER.zarr
+    interval_start_minutes: -30
+    interval_end_minutes: 0
     time_resolution_minutes: 5
-    satellite_channels:
-      # Uses for each channel taken from https://resources.eumetrain.org/data/3/311/bsc_s4.pdf
+    channels:
       - IR_016 # Surface, cloud phase
       - IR_039 # Surface, clouds, wind fields
       - IR_087 # Surface, clouds, atmospheric instability
@@ -164,9 +126,7 @@ input_data:
       - VIS008 # Surface, clouds, wind fields
       - WV_062 # Water vapor, high level clouds, upper air analysis
       - WV_073 # Water vapor, atmospheric instability, upper-level dynamics
-    satellite_image_size_pixels_height: 24
-    satellite_image_size_pixels_width: 24
-    # A random value from the list below will be chosen as the delay when dropout is used
-    # If set to null no dropout is applied. Values must be negative.
+    image_size_pixels_height: 24
+    image_size_pixels_width: 24
     dropout_timedeltas_minutes: null
     dropout_fraction: 0 # Fraction of samples with dropout
@@ -1,8 +1,10 @@
 _target_: pvnet.data.datamodule.DataModule
 configuration: null
-# The batch_dir is the location batches were saved to using the save_batches.py script
-# The batch_dir should contain train and val subdirectories with batches
-batch_dir: "PLACEHOLDER"
+
+# The sample_dir is the location samples were saved to using the save_samples.py script
+# The sample_dir should contain train and val subdirectories with samples
+
+sample_dir: "PLACEHOLDER"
 num_workers: 10
 prefetch_factor: 2
 batch_size: 8
@@ -2,21 +2,19 @@ _target_: pvnet.data.datamodule.DataModule
 # Path to the data configuration yaml file. You can find examples in the configuration subdirectory
 # in configs.example/datamodule/configuration
 # Use the full local path such as: /FULL/PATH/PVNet/configs/datamodule/configuration/gcp_configuration.yaml
+
 configuration: "PLACEHOLDER.yaml"
 num_workers: 20
 prefetch_factor: 2
 batch_size: 8
-batch_output_dir: "PLACEHOLDER"
-num_train_batches: 2
-num_val_batches: 1
 
+sample_output_dir: "PLACEHOLDER"
+num_train_samples: 2
+num_val_samples: 1
 
 train_period:
   - null
   - "2022-05-07"
 val_period:
   - "2022-05-08"
   - "2023-05-08"
-test_period:
-  - "2022-05-08"
-  - "2023-05-08"
@@ -1,2 +1,3 @@
 """Data parts"""
-from .utils import BatchSplitter
+from .site_datamodule import SiteDataModule
+from .uk_regional_datamodule import DataModule
Original file line number	Diff line number	Diff line change
`@@ -140,3 +140,4 @@ dmypy.json`
`140`	`140`
`141`	`141`	`# Pyre type checker`
`142`	`142`	`.pyre/`
	`143`	`+.DS_Store`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`include *.txt`
	`2`	`+recursive-include pvnet/models/model_cards *.md`