
Commit 07ee51b

Merge pull request #56 from openclimatefix/feature/issue-31
Feature/issue 31
2 parents 53a29da + 4fd0f44 commit 07ee51b

File tree

2 files changed: +112 -19 lines

src/open_data_pvnet/main.py
src/open_data_pvnet/utils/data_downloader.py

src/open_data_pvnet/main.py

Lines changed: 37 additions & 19 deletions
@@ -2,7 +2,7 @@
 import logging
 from open_data_pvnet.scripts.archive import handle_archive
 from open_data_pvnet.utils.env_loader import load_environment_variables
-from open_data_pvnet.utils.data_downloader import load_zarr_data
+from open_data_pvnet.utils.data_downloader import load_zarr_data, load_zarr_data_for_day
 from pathlib import Path
 import concurrent.futures
 from typing import List, Tuple
@@ -76,7 +76,12 @@ def _add_common_arguments(parser, provider_name):
     """Add arguments common to both archive and load operations."""
     parser.add_argument("--year", type=int, required=True, help="Year of data")
     parser.add_argument("--month", type=int, required=True, help="Month of data")
-    parser.add_argument("--day", type=int, required=True, help="Day of data")
+    parser.add_argument(
+        "--day",
+        type=int,
+        help="Day of data (optional - if not provided, loads entire month)",
+        default=None,
+    )
 
     # Add Met Office specific arguments
     if provider_name == "metoffice":
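As an aside on the hunk above, here is a minimal, standalone argparse sketch (not the project's actual CLI wiring, which this diff does not show) illustrating what dropping required=True and adding default=None means for --day:

import argparse

# Hypothetical parser reproducing only the --year/--month/--day arguments from the diff.
parser = argparse.ArgumentParser()
parser.add_argument("--year", type=int, required=True, help="Year of data")
parser.add_argument("--month", type=int, required=True, help="Month of data")
parser.add_argument(
    "--day",
    type=int,
    help="Day of data (optional - if not provided, loads entire month)",
    default=None,
)

print(parser.parse_args(["--year", "2023", "--month", "1"]).day)               # None
print(parser.parse_args(["--year", "2023", "--month", "1", "--day", "16"]).day)  # 16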
@@ -114,28 +119,41 @@ def parse_chunks(chunks_str):
 
 def handle_load(provider: str, year: int, month: int, day: int, **kwargs):
     """Handle loading archived data."""
-    hour = kwargs.get("hour", 0)  # Default to hour 0 if not specified
     chunks = parse_chunks(kwargs.get("chunks"))
     remote = kwargs.get("remote", False)
+    hour = kwargs.get("hour")
 
-    # Construct the archive path based on provider and parameters
-    # Format: data/2023/01/16/2023-01-16-00.zarr.zip
-    archive_path = (
-        Path("data")
-        / str(year)
-        / f"{month:02d}"
-        / f"{day:02d}"
-        / f"{year}-{month:02d}-{day:02d}-{hour:02d}.zarr.zip"
-    )
+    # Base path for the data
+    base_path = Path("data") / str(year) / f"{month:02d}" / f"{day:02d}"
 
     try:
-        dataset = load_zarr_data(
-            archive_path,
-            chunks=chunks,
-            remote=remote,
-            download=not remote,  # Don't try to download if remote=True
-        )
-        logger.info(f"Successfully loaded dataset for {year}-{month:02d}-{day:02d} hour {hour:02d}")
+        if hour is not None:
+            # Load specific hour
+            archive_path = base_path / f"{year}-{month:02d}-{day:02d}-{hour:02d}.zarr.zip"
+            dataset = load_zarr_data(
+                archive_path,
+                chunks=chunks,
+                remote=remote,
+                download=not remote,
+            )
+            logger.info(
+                f"Successfully loaded dataset for {year}-{month:02d}-{day:02d} hour {hour:02d}"
+            )
+        else:
+            # Load all hours for the day
+            dataset = load_zarr_data_for_day(
+                base_path,
+                year,
+                month,
+                day,
+                chunks=chunks,
+                remote=remote,
+                download=not remote,
+            )
+            logger.info(
+                f"Successfully loaded all available datasets for {year}-{month:02d}-{day:02d}"
+            )
+
         return dataset
     except Exception as e:
         logger.error(f"Error loading dataset: {e}")

src/open_data_pvnet/utils/data_downloader.py

Lines changed: 75 additions & 0 deletions
@@ -188,3 +188,78 @@ def load_zarr_data(
     except Exception as e:
         logger.error(f"Error loading zarr dataset: {e}")
         raise
+
+
+def load_zarr_data_for_day(  # noqa: C901
+    base_path: Path, year: int, month: int, day: int, chunks=None, remote=False, download=True
+):
+    """Load and merge all hourly Zarr datasets for a given day."""
+    datasets = []
+    stores = []  # Keep track of stores to close them later
+
+    try:
+        for hour in range(24):
+            archive_path = base_path / f"{year}-{month:02d}-{day:02d}-{hour:02d}.zarr.zip"
+            try:
+                if remote:
+                    dataset = _load_remote_zarr(
+                        get_hf_url(archive_path),
+                        chunks=chunks,
+                        consolidated=False,
+                        restructure=True,
+                    )
+                else:
+                    if not archive_path.exists() and download:
+                        download_from_hf(str(archive_path), archive_path)
+
+                    logger.info(f"Opening zarr store from {archive_path}")
+                    logger.info(f"File size: {archive_path.stat().st_size / (1024*1024):.2f} MB")
+
+                    store = zarr.storage.ZipStore(str(archive_path), mode="r")
+                    stores.append(store)  # Keep track of the store
+
+                    zarr_groups = get_zarr_groups(store)
+                    hour_datasets = []
+
+                    for group in zarr_groups:
+                        try:
+                            group_ds = open_zarr_group(store, group, chunks, False)
+                            hour_datasets.append(group_ds)
+                        except Exception as e:
+                            logger.warning(f"Could not open group {group}: {e}")
+                            continue
+
+                    if not hour_datasets:
+                        raise ValueError("No valid datasets found in the Zarr store")
+
+                    dataset = merge_datasets(hour_datasets)
+                    dataset = restructure_dataset(dataset)
+
+                datasets.append(dataset)
+                logger.info(
+                    f"Successfully loaded dataset for {year}-{month:02d}-{day:02d} hour {hour:02d}"
+                )
+
+            except Exception as e:
+                logger.warning(f"Could not load dataset for hour {hour}: {e}")
+                continue
+
+        if not datasets:
+            raise ValueError(f"No datasets could be loaded for {year}-{month:02d}-{day:02d}")
+
+        # Merge all datasets along the time dimension
+        merged_dataset = xr.concat(datasets, dim="time")
+        logger.info(f"Successfully merged {len(datasets)} hourly datasets")
+
+        # Load the merged dataset into memory before closing stores
+        merged_dataset = merged_dataset.compute()
+
+        return merged_dataset
+
+    finally:
+        # Close all stores in the finally block
+        for store in stores:
+            try:
+                store.close()
+            except Exception as e:
+                logger.warning(f"Error closing store: {e}")
