Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up on, saving to cloud #214

Merged
merged 30 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
c13ba38
add NUMBER_CONCURRENT_JOBS
peterdudfield Jan 2, 2025
9154578
mypy fix
peterdudfield Jan 2, 2025
8cef147
mypy
peterdudfield Jan 2, 2025
c63a554
roll back
peterdudfield Jan 2, 2025
a070be3
turn on verbose
peterdudfield Jan 2, 2025
2b33bc1
try with processes
peterdudfield Jan 2, 2025
6821301
add logging
peterdudfield Jan 2, 2025
68651b3
add log
peterdudfield Jan 2, 2025
c5b439a
make chunk size of lat and lon 1
peterdudfield Jan 2, 2025
bca7ce2
roll back
peterdudfield Jan 2, 2025
1131ec5
change to 17 and 18 chunks for lat lon
peterdudfield Jan 2, 2025
7f4df0d
have option to change large chunk size divider
peterdudfield Jan 2, 2025
df748a0
lint
peterdudfield Jan 2, 2025
30b9510
fix
peterdudfield Jan 2, 2025
f39af76
lint
peterdudfield Jan 2, 2025
e7ceff9
lint
peterdudfield Jan 2, 2025
69de6dc
try using safe_chunks=False
peterdudfield Jan 2, 2025
6150c46
remove truncate
peterdudfield Jan 2, 2025
feba5d1
tidy
peterdudfield Jan 2, 2025
c1d8702
lint
peterdudfield Jan 2, 2025
0a18e80
remove chunking of 1
peterdudfield Jan 3, 2025
e150bd6
change to 2 chunks in lat lon
peterdudfield Jan 3, 2025
1fd874d
add logging
peterdudfield Jan 3, 2025
a7d304e
remove safe chunks
peterdudfield Jan 3, 2025
64cfa43
Move maximum chunk size back to 8, change to 2 in ecmwf realtime
peterdudfield Jan 3, 2025
0d24ef4
lint
peterdudfield Jan 3, 2025
75bc57e
tidy up, for parallel threads and processes
peterdudfield Jan 3, 2025
daa2a9a
tidy
peterdudfield Jan 3, 2025
45d0c0c
Merge branch 'main' into concurrent-jobs
peterdudfield Jan 6, 2025
6c3e9ed
fix(coordinate): Log warning on unsafe regional writes (#216)
devsjc Jan 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions src/nwp_consumer/internal/entities/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,10 @@ class NWPDimensionCoordinateMap:
Will be truncated to 4 decimal places, and ordered as 90 -> -90.
"""
longitude: list[float] | None = None
"""The longitude coordinates of the forecast grid in degrees.
"""The longitude coordinates of the forecast grid in degrees. """
maximum_number_of_chunks_in_one_dim: int = 8
""" The maximum number of chunks in one dimension.
When saving to S3 we might want this to be small, to reduce the number of files saved.

Will be truncated to 4 decimal places, and ordered as -180 -> 180.
"""
Expand All @@ -116,7 +119,9 @@ def dims(self) -> list[str]:
Ignores any dimensions that do not have a corresponding coordinate
index value list.
"""
return [f.name for f in dataclasses.fields(self) if getattr(self, f.name) is not None]
return [f.name for f in dataclasses.fields(self) if
getattr(self, f.name) is not None
and f.name != "maximum_number_of_chunks_in_one_dim"]

@property
def shapemap(self) -> dict[str, int]:
Expand Down Expand Up @@ -409,7 +414,8 @@ def default_chunking(self) -> dict[str, int]:
"init_time": 1,
"step": 1,
} | {
dim: len(getattr(self, dim)) // 8 if len(getattr(self, dim)) > 8 else 1
dim: len(getattr(self, dim)) // self.maximum_number_of_chunks_in_one_dim
if len(getattr(self, dim)) > self.maximum_number_of_chunks_in_one_dim else 1
for dim in self.dims
if dim not in ["init_time", "step"]
}
Expand Down
7 changes: 7 additions & 0 deletions src/nwp_consumer/internal/entities/modelmetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ def with_region(self, region: str) -> "ModelMetadata":
log.warning(f"Unknown region '{region}', not cropping expected coordinates.")
return self

def set_maximum_number_of_chunks_in_one_dim(
    self,
    maximum_number_of_chunks_in_one_dim: int,
) -> "ModelMetadata":
    """Override the per-dimension chunk-count cap on the expected coordinates.

    Returns self so the call can be chained fluently, e.g. after
    ``with_region(...)``.

    Args:
        maximum_number_of_chunks_in_one_dim: New cap for the number of
            chunks along any single dimension.

    Returns:
        This metadata instance, with the cap applied.
    """
    # NOTE(review): this mutates the existing coordinate map in place —
    # confirm the metadata object is not shared with other model entries
    # before chaining this call.
    coords = self.expected_coordinates
    coords.maximum_number_of_chunks_in_one_dim = maximum_number_of_chunks_in_one_dim
    return self


class Models:
"""Namespace containing known models."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,10 @@ def repository() -> entities.RawRepositoryMetadata:
},
postprocess_options=entities.PostProcessOptions(),
available_models={
"default": entities.Models.ECMWF_HRES_IFS_0P1DEGREE.with_region("uk"),
"hres-ifs-uk": entities.Models.ECMWF_HRES_IFS_0P1DEGREE.with_region("uk"),
"default": entities.Models.ECMWF_HRES_IFS_0P1DEGREE.with_region("uk")
.set_maximum_number_of_chunks_in_one_dim(2),
"hres-ifs-uk": entities.Models.ECMWF_HRES_IFS_0P1DEGREE.with_region("uk")
.set_maximum_number_of_chunks_in_one_dim(2),
"hres-ifs-india": entities.Models.ECMWF_HRES_IFS_0P1DEGREE.with_region("india"),
},
)
Expand Down Expand Up @@ -194,6 +196,7 @@ def _download(self, url: str) -> ResultE[pathlib.Path]:
).with_suffix(".grib").expanduser()

# Only download the file if not already present
log.info("Checking for local file: '%s'", local_path)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this and the below should be debug logs

if not local_path.exists() or local_path.stat().st_size == 0:
local_path.parent.mkdir(parents=True, exist_ok=True)
log.debug("Requesting file from S3 at: '%s'", url)
Expand All @@ -203,6 +206,7 @@ def _download(self, url: str) -> ResultE[pathlib.Path]:
raise FileNotFoundError(f"File not found at '{url}'")

with local_path.open("wb") as lf, self._fs.open(url, "rb") as rf:
log.info(f"Writing file from {url} to {local_path}")
for chunk in iter(lambda: rf.read(12 * 1024), b""):
lf.write(chunk)
lf.flush()
Expand Down Expand Up @@ -280,6 +284,7 @@ def _convert(path: pathlib.Path) -> ResultE[list[xr.DataArray]]:
.sortby(variables=["step", "variable", "longitude"])
.sortby(variables="latitude", ascending=False)
)

except Exception as e:
return Failure(ValueError(
f"Error processing dataset {i} from '{path}' to DataArray: {e}",
Expand Down
11 changes: 8 additions & 3 deletions src/nwp_consumer/internal/services/consumer_service.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had it as threads because it's IO that's intensive, as opposed to compute, in each iteration. How come you changed it to processes? Also, if concurrency is set to True, why would n_jobs want to then be set to 1? Would that not make it not concurrent again?

Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,18 @@ def _parallelize_generator[T](
"""
# TODO: Change this based on threads instead of CPU count
n_jobs: int = max(cpu_count() - 1, max_connections)
if os.getenv("CONCURRENCY", "True").capitalize() == "False":
prefer = "threads"

concurrency = os.getenv("CONCURRENCY", "True").capitalize() == "False"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry, there is some funny logic here that is not clear, I will tidy up

if concurrency:
n_jobs = 1
log.debug(f"Using {n_jobs} concurrent thread(s)")
prefer = "processes"

log.debug(f"Using {n_jobs} concurrent {prefer}")

return Parallel( # type: ignore
n_jobs=n_jobs,
prefer="threads",
prefer=prefer,
verbose=0,
return_as="generator_unordered",
)(delayed_generator)
Expand Down
Loading