From e702fe62a5004db2c83bf75d483ec001092ae921 Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 13 Mar 2024 18:35:26 -0400 Subject: [PATCH] [#8] saving netcdf specific metadata to json file - initial work --- hsextract/adapters/hydroshare.py | 76 +++++++++++++++++++- hsextract/file_utils.py | 10 ++- hsextract/models/schema.py | 120 ++++++++++++++++++++++++++++++- hsextract/netcdf/utils.py | 3 +- hsextract/utils.py | 4 +- 5 files changed, 206 insertions(+), 7 deletions(-) diff --git a/hsextract/adapters/hydroshare.py b/hsextract/adapters/hydroshare.py index b81bcf2..d0b9def 100644 --- a/hsextract/adapters/hydroshare.py +++ b/hsextract/adapters/hydroshare.py @@ -1,4 +1,3 @@ -import requests from datetime import datetime from typing import Any, List, Optional, Union from pydantic import BaseModel, EmailStr, HttpUrl @@ -258,3 +257,78 @@ def to_catalog_dataset(self): dataset.license = self.to_dataset_license() dataset.citation = [self.citation] return dataset + + +class Variable(BaseModel): + name: str + descriptive_name: Optional[str] + unit: str + type: str + shape: str + method: Optional[str] + + def to_aggregation_variable(self): + _property_value = schema.PropertyValue.construct() + _property_value.name = self.name + _property_value.unitCode = self.unit + _property_value.description = self.descriptive_name + _property_value.measurementTechnique = self.method + # creating a nested PropertyValue object to account for the shape field of the extracted variable metadata + _property_value.value = schema.PropertyValue.construct() + _property_value.value.name = "shape" + _property_value.value.unitCode = self.type + _property_value.value.value = self.shape + return _property_value + + +class NetCDFAggregationMetadataAdapter: + @staticmethod + def to_catalog_record(aggr_metadata: dict): + """Converts extracted netcdf aggregation metadata to a catalog dataset record""" + nc_aggr_model = _NetCDFAggregationMetadata(**aggr_metadata) + return nc_aggr_model.to_catalog_dataset() + + +class _NetCDFAggregationMetadata(BaseModel): + title: str + abstract: str + subjects: Optional[List[str]] + variables: List[Variable] + spatial_coverage: Optional[Union[SpatialCoverageBox, SpatialCoveragePoint]] + period_coverage: Optional[TemporalCoverage] + content_files: Optional[List[ContentFile]] + # the extracted file (media object) metadata is already in schema.MediaObject format + associatedMedia: Optional[List[schema.MediaObject]] + + def to_aggregation_associated_media(self): + media_objects = [] + for content_file in self.content_files: + media_objects.append(content_file.to_dataset_media_object()) + return media_objects + + def to_aggregation_spatial_coverage(self): + if self.spatial_coverage: + return self.spatial_coverage.to_dataset_spatial_coverage() + return None + + def to_aggregation_period_coverage(self): + if self.period_coverage: + return self.period_coverage.to_dataset_temporal_coverage() + return None + + def to_aggregation_keywords(self): + if self.subjects: + return self.subjects + return None + + def to_catalog_dataset(self): + aggregation_metadata = schema.NetCDFAggregationMetadata.construct() + aggregation_metadata.name = self.title + aggregation_metadata.description = self.abstract + aggregation_metadata.keywords = self.to_aggregation_keywords() + aggregation_metadata.spatialCoverage = self.to_aggregation_spatial_coverage() + aggregation_metadata.temporalCoverage = self.to_aggregation_period_coverage() + aggregation_metadata.variableMeasured = [v.to_aggregation_variable() for v in self.variables] + aggregation_metadata.additionalProperty = [] + aggregation_metadata.associatedMedia = self.associatedMedia + return aggregation_metadata diff --git a/hsextract/file_utils.py b/hsextract/file_utils.py index 3f53a1f..370083b 100644 --- a/hsextract/file_utils.py +++ b/hsextract/file_utils.py @@ -14,4 +14,12 @@ def file_metadata(path: str): checksum = d.hexdigest() size = f"{os.path.getsize(path)/1000.00} KB" mime_type = mimetypes.guess_type(path)[0] - return {"contentUrl": path, "contentSize": size, "sha256": checksum, "encodingFormat": mime_type}, None + file_name = os.path.basename(path) + file_meta = { + "contentUrl": path, + "contentSize": size, + "sha256": checksum, + "encodingFormat": mime_type, + "name": file_name + } + return file_meta, None diff --git a/hsextract/models/schema.py b/hsextract/models/schema.py index 5aa53b3..5146094 100644 --- a/hsextract/models/schema.py +++ b/hsextract/models/schema.py @@ -333,7 +333,7 @@ class MediaObject(SchemaBaseModel): "unit of measurement.", ) name: str = Field(description="The name of the media object (file).") - checksum: str = Field(description="The MD5 checksum of the file") + sha256: Optional[str] = Field(title="SHA-256", description="The SHA-256 hash of the media object.") @validator('contentSize') def validate_content_size(cls, v): @@ -363,6 +363,53 @@ def validate_content_size(cls, v): return v +class PropertyValueBase(SchemaBaseModel): + type: str = Field( + alias="@type", + default="PropertyValue", + const="PropertyValue", + description="A property-value pair.", + ) + propertyID: Optional[str] = Field( + title="Property ID", description="The ID of the property." + ) + name: str = Field(description="The name of the property.") + value: str = Field(description="The value of the property.") + unitCode: Optional[str] = Field( + title="Measurement unit", description="The unit of measurement for the value." + ) + description: Optional[str] = Field(description="A description of the property.") + minValue: Optional[float] = Field( + title="Minimum value", description="The minimum allowed value for the property." + ) + maxValue: Optional[float] = Field( + title="Maximum value", description="The maximum allowed value for the property." + ) + measurementTechnique: Optional[str] = Field( + title="Measurement technique", description="A technique or technology used in a measurement." + ) + + class Config: + title = "PropertyValue" + + @root_validator + def validate_min_max_values(cls, values): + min_value = values.get("minValue", None) + max_value = values.get("maxValue", None) + if min_value is not None and max_value is not None: + if min_value > max_value: + raise ValueError("Minimum value must be less than or equal to maximum value") + + return values + + +class PropertyValue(PropertyValueBase): + # using PropertyValueBase model instead of PropertyValue model as one of the types for the value field + # in order for the schema generation (schema.json) to work. Self referencing nested models leads to + # infinite loop in our custom schema generation code when trying to replace dict with key '$ref' + value: Union[str, PropertyValueBase, List[PropertyValueBase]] = Field(description="The value of the property.") + + class CoreMetadata(SchemaBaseModel): context: HttpUrl = Field( alias='@context', @@ -468,9 +515,20 @@ class CoreMetadata(SchemaBaseModel): citation: Optional[List[str]] = Field(title="Citation", description="A bibliographic citation for the resource.") -class DatasetSchema(CoreMetadata): +class DatasetMetadata(CoreMetadata): # used only for generating the JSON-LD schema for a dataset. - pass + variableMeasured: Optional[List[Union[str, PropertyValue]]] = Field( + title="Variables measured", description="Measured variables." + ) + additionalProperty: Optional[List[PropertyValue]] = Field( + title="Additional properties", + default=[], + description="Additional properties of the Dataset." + ) + sourceOrganization: Optional[Organization] = Field( + title="Source organization", + description="The organization that provided the data for this dataset." + ) class CoreMetadataDOC(CoreMetadata): @@ -485,3 +543,59 @@ class Settings: year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, minute=dt.minute, second=dt.second ), } + + +class DatasetMetadataDOC(CoreMetadataDOC, DatasetMetadata): + pass + + +class BaseAggregationMetadata(BaseModel): + """Base class for aggregation metadata - used for metadata view in UI.""" + type: str = Field( + alias="@type", + default="Aggregation", + const=True, + description="Type of aggregation." + ) + name: Optional[str] = Field( + description="A text string with a descriptive name or title for the aggregation." + ) + description: Optional[str] = Field( + description="A text string containing a description/abstract for the aggregation." + ) + keywords: List[str] = Field( + min_items=0, + description="Keywords or tags used to describe the dataset, delimited by commas." + ) + associatedMedia: Optional[List[MediaObject]] = Field( + title="Aggregation content", + description="A media object that encodes this aggregation." + ) + spatialCoverage: Optional[Place] = Field( + description="The spatialCoverage of a CreativeWork indicates the place(s) which are the focus of the content. " + "It is a sub property of contentLocation intended primarily for more technical and " + "detailed materials. For example with a Dataset, it indicates areas that the dataset " + "describes: a dataset of New York weather would have spatialCoverage which was the " + "place: the state of New York." + ) + temporalCoverage: Optional[TemporalCoverage] = Field( + title="Temporal coverage", + description="The time period that applies to all of the content within the aggregation." + ) + additionalProperty: Optional[List[PropertyValue]] = Field( + title="Additional properties", + default=[], + description="Additional properties of the aggregation." + ) + + +class NetCDFAggregationMetadata(BaseAggregationMetadata): + type: str = Field( + alias="@type", + default="Multidimensional Dataset", + const=True, + description="Type of aggregation." + ) + variableMeasured: Optional[List[Union[str, PropertyValue]]] = Field( + title="Variables measured", description="Measured variables." + ) diff --git a/hsextract/netcdf/utils.py b/hsextract/netcdf/utils.py index 190e29b..f406a56 100644 --- a/hsextract/netcdf/utils.py +++ b/hsextract/netcdf/utils.py @@ -58,12 +58,13 @@ def get_nc_meta_dict(nc_file_name): else: nc_dataset = get_nc_dataset(nc_file_name) + if nc_dataset is None: + raise ValueError(f"The file:{nc_file_name} is not valid netcdf file.") res_dublin_core_meta = get_dublin_core_meta(nc_dataset) res_type_specific_meta = get_type_specific_meta(nc_dataset) nc_dataset.close() md = combine_metadata(res_dublin_core_meta, res_type_specific_meta) - md["content_files"] = [nc_file_name] return md diff --git a/hsextract/utils.py b/hsextract/utils.py index 4144d95..bccbfe5 100644 --- a/hsextract/utils.py +++ b/hsextract/utils.py @@ -5,7 +5,7 @@ from pathlib import Path -from hsextract.adapters.hydroshare import HydroshareMetadataAdapter +from hsextract.adapters.hydroshare import HydroshareMetadataAdapter, NetCDFAggregationMetadataAdapter from hsextract.listing.utils import prepare_files from hsextract.models.schema import CoreMetadataDOC from hsextract.raster.utils import extract_from_tif_file @@ -50,6 +50,8 @@ def extract_metadata(type: str, filepath): return json.loads(CoreMetadataDOC.construct(**extracted_metadata).json()) else: extracted_metadata["associatedMedia"] = all_file_metadata + if type == "netcdf": + adapter = NetCDFAggregationMetadataAdapter() catalog_record = json.loads(adapter.to_catalog_record(extracted_metadata).json()) return catalog_record