Skip to content

Commit

Permalink
[#8] Saving NetCDF-specific metadata to a JSON file - initial work
Browse files Browse the repository at this point in the history
  • Loading branch information
pkdash committed Mar 13, 2024
1 parent 8292dcd commit e702fe6
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 7 deletions.
76 changes: 75 additions & 1 deletion hsextract/adapters/hydroshare.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import requests
from datetime import datetime
from typing import Any, List, Optional, Union
from pydantic import BaseModel, EmailStr, HttpUrl
Expand Down Expand Up @@ -258,3 +257,78 @@ def to_catalog_dataset(self):
dataset.license = self.to_dataset_license()
dataset.citation = [self.citation]
return dataset


class Variable(BaseModel):
    # Extracted NetCDF variable metadata (as produced by the netcdf extractor).
    name: str
    descriptive_name: Optional[str]
    unit: str
    type: str
    shape: str
    method: Optional[str]

    def to_aggregation_variable(self):
        """Map this extracted variable to a schema.PropertyValue for the catalog record."""
        # The shape of the variable has no direct slot on PropertyValue, so it is
        # carried as a nested PropertyValue stored in the outer object's value field.
        shape_property = schema.PropertyValue.construct()
        shape_property.name = "shape"
        shape_property.unitCode = self.type
        shape_property.value = self.shape

        variable_property = schema.PropertyValue.construct()
        variable_property.name = self.name
        variable_property.unitCode = self.unit
        variable_property.description = self.descriptive_name
        variable_property.measurementTechnique = self.method
        variable_property.value = shape_property
        return variable_property


class NetCDFAggregationMetadataAdapter:
    """Adapter turning extracted NetCDF aggregation metadata into a catalog record."""

    @staticmethod
    def to_catalog_record(aggr_metadata: dict):
        """Validate the extracted metadata dict and convert it to a catalog dataset record."""
        return _NetCDFAggregationMetadata(**aggr_metadata).to_catalog_dataset()


class _NetCDFAggregationMetadata(BaseModel):
    # Internal model validating the raw metadata dict extracted from a NetCDF file.
    title: str
    abstract: str
    subjects: Optional[List[str]]
    variables: List[Variable]
    spatial_coverage: Optional[Union[SpatialCoverageBox, SpatialCoveragePoint]]
    period_coverage: Optional[TemporalCoverage]
    content_files: Optional[List[ContentFile]]
    # the extracted file (media object) metadata is already in schema.MediaObject format
    associatedMedia: Optional[List[schema.MediaObject]]

    def to_aggregation_associated_media(self):
        """Return a dataset media object for each extracted content file."""
        return [content_file.to_dataset_media_object() for content_file in self.content_files]

    def to_aggregation_spatial_coverage(self):
        """Convert the spatial coverage if present, otherwise None."""
        return self.spatial_coverage.to_dataset_spatial_coverage() if self.spatial_coverage else None

    def to_aggregation_period_coverage(self):
        """Convert the temporal coverage if present, otherwise None."""
        return self.period_coverage.to_dataset_temporal_coverage() if self.period_coverage else None

    def to_aggregation_keywords(self):
        """Return the subject keywords, or None when there are none."""
        return self.subjects or None

    def to_catalog_dataset(self):
        """Assemble the schema.NetCDFAggregationMetadata catalog record."""
        record = schema.NetCDFAggregationMetadata.construct()
        record.name = self.title
        record.description = self.abstract
        record.keywords = self.to_aggregation_keywords()
        record.spatialCoverage = self.to_aggregation_spatial_coverage()
        record.temporalCoverage = self.to_aggregation_period_coverage()
        record.variableMeasured = [variable.to_aggregation_variable() for variable in self.variables]
        record.additionalProperty = []
        record.associatedMedia = self.associatedMedia
        return record
10 changes: 9 additions & 1 deletion hsextract/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,12 @@ def file_metadata(path: str):
checksum = d.hexdigest()
size = f"{os.path.getsize(path)/1000.00} KB"
mime_type = mimetypes.guess_type(path)[0]
return {"contentUrl": path, "contentSize": size, "sha256": checksum, "encodingFormat": mime_type}, None
file_name = os.path.basename(path)
file_meta = {
"contentUrl": path,
"contentSize": size,
"sha256": checksum,
"encodingFormat": mime_type,
"name": file_name
}
return file_meta, None
120 changes: 117 additions & 3 deletions hsextract/models/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ class MediaObject(SchemaBaseModel):
"unit of measurement.",
)
name: str = Field(description="The name of the media object (file).")
checksum: str = Field(description="The MD5 checksum of the file")
sha256: Optional[str] = Field(title="SHA-256", description="The SHA-256 hash of the media object.")

@validator('contentSize')
def validate_content_size(cls, v):
Expand Down Expand Up @@ -363,6 +363,53 @@ def validate_content_size(cls, v):
return v


class PropertyValueBase(SchemaBaseModel):
    # Schema.org PropertyValue: a named value with optional unit, bounds,
    # description, and measurement technique.
    type: str = Field(
        alias="@type",
        default="PropertyValue",
        # pydantic v1's `const` parameter is a bool (True pins the field to its
        # default value). The original passed the string "PropertyValue", which
        # only worked by truthiness; True matches the const=True usage on the
        # other models in this module.
        const=True,
        description="A property-value pair.",
    )
    propertyID: Optional[str] = Field(
        title="Property ID", description="The ID of the property."
    )
    name: str = Field(description="The name of the property.")
    value: str = Field(description="The value of the property.")
    unitCode: Optional[str] = Field(
        title="Measurement unit", description="The unit of measurement for the value."
    )
    description: Optional[str] = Field(description="A description of the property.")
    minValue: Optional[float] = Field(
        title="Minimum value", description="The minimum allowed value for the property."
    )
    maxValue: Optional[float] = Field(
        title="Maximum value", description="The maximum allowed value for the property."
    )
    measurementTechnique: Optional[str] = Field(
        title="Measurement technique", description="A technique or technology used in a measurement."
    )

    class Config:
        title = "PropertyValue"

    @root_validator
    def validate_min_max_values(cls, values):
        """Reject a (minValue, maxValue) pair where the lower bound exceeds the upper."""
        min_value = values.get("minValue", None)
        max_value = values.get("maxValue", None)
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("Minimum value must be less than or equal to maximum value")
        return values


class PropertyValue(PropertyValueBase):
    # using PropertyValueBase model instead of PropertyValue model as one of the types for the value field
    # in order for the schema generation (schema.json) to work. Self referencing nested models leads to
    # infinite loop in our custom schema generation code when trying to replace dict with key '$ref'
    # NOTE(review): this caps nesting at one level of structured values; deeper
    # nesting would require fixing the $ref handling in the schema generator.
    value: Union[str, PropertyValueBase, List[PropertyValueBase]] = Field(description="The value of the property.")


class CoreMetadata(SchemaBaseModel):
context: HttpUrl = Field(
alias='@context',
Expand Down Expand Up @@ -468,9 +515,20 @@ class CoreMetadata(SchemaBaseModel):
citation: Optional[List[str]] = Field(title="Citation", description="A bibliographic citation for the resource.")


class DatasetSchema(CoreMetadata):
class DatasetMetadata(CoreMetadata):
# used only for generating the JSON-LD schema for a dataset.
pass
variableMeasured: Optional[List[Union[str, PropertyValue]]] = Field(
title="Variables measured", description="Measured variables."
)
additionalProperty: Optional[List[PropertyValue]] = Field(
title="Additional properties",
default=[],
description="Additional properties of the Dataset."
)
sourceOrganization: Optional[Organization] = Field(
title="Source organization",
description="The organization that provided the data for this dataset."
)


class CoreMetadataDOC(CoreMetadata):
Expand All @@ -485,3 +543,59 @@ class Settings:
year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, minute=dt.minute, second=dt.second
),
}


# Document model for a dataset record: combines DatasetMetadata's fields with
# CoreMetadataDOC's document behavior; intentionally adds nothing of its own.
class DatasetMetadataDOC(CoreMetadataDOC, DatasetMetadata):
    pass


class BaseAggregationMetadata(BaseModel):
    """Base class for aggregation metadata - used for metadata view in UI."""
    # const=True pins the @type field to its default; subclasses override the
    # default to label their specific aggregation type.
    type: str = Field(
        alias="@type",
        default="Aggregation",
        const=True,
        description="Type of aggregation."
    )
    name: Optional[str] = Field(
        description="A text string with a descriptive name or title for the aggregation."
    )
    description: Optional[str] = Field(
        description="A text string containing a description/abstract for the aggregation."
    )
    # required field, but min_items=0 allows an empty keyword list
    keywords: List[str] = Field(
        min_items=0,
        description="Keywords or tags used to describe the dataset, delimited by commas."
    )
    associatedMedia: Optional[List[MediaObject]] = Field(
        title="Aggregation content",
        description="A media object that encodes this aggregation."
    )
    spatialCoverage: Optional[Place] = Field(
        description="The spatialCoverage of a CreativeWork indicates the place(s) which are the focus of the content. "
                    "It is a sub property of contentLocation intended primarily for more technical and "
                    "detailed materials. For example with a Dataset, it indicates areas that the dataset "
                    "describes: a dataset of New York weather would have spatialCoverage which was the "
                    "place: the state of New York."
    )
    temporalCoverage: Optional[TemporalCoverage] = Field(
        title="Temporal coverage",
        description="The time period that applies to all of the content within the aggregation."
    )
    # NOTE(review): mutable default on a pydantic Field — pydantic v1 deep-copies
    # defaults per instance, so this is safe here (unlike a plain function default).
    additionalProperty: Optional[List[PropertyValue]] = Field(
        title="Additional properties",
        default=[],
        description="Additional properties of the aggregation."
    )


class NetCDFAggregationMetadata(BaseAggregationMetadata):
    # Aggregation metadata for a NetCDF (multidimensional) dataset: relabels the
    # @type and adds the measured-variables list on top of the base fields.
    type: str = Field(
        alias="@type",
        default="Multidimensional Dataset",
        const=True,
        description="Type of aggregation."
    )
    variableMeasured: Optional[List[Union[str, PropertyValue]]] = Field(
        title="Variables measured", description="Measured variables."
    )
3 changes: 2 additions & 1 deletion hsextract/netcdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,13 @@ def get_nc_meta_dict(nc_file_name):
else:
nc_dataset = get_nc_dataset(nc_file_name)

if nc_dataset is None:
raise ValueError(f"The file:{nc_file_name} is not valid netcdf file.")
res_dublin_core_meta = get_dublin_core_meta(nc_dataset)
res_type_specific_meta = get_type_specific_meta(nc_dataset)
nc_dataset.close()

md = combine_metadata(res_dublin_core_meta, res_type_specific_meta)

md["content_files"] = [nc_file_name]
return md

Expand Down
4 changes: 3 additions & 1 deletion hsextract/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from pathlib import Path

from hsextract.adapters.hydroshare import HydroshareMetadataAdapter
from hsextract.adapters.hydroshare import HydroshareMetadataAdapter, NetCDFAggregationMetadataAdapter
from hsextract.listing.utils import prepare_files
from hsextract.models.schema import CoreMetadataDOC
from hsextract.raster.utils import extract_from_tif_file
Expand Down Expand Up @@ -50,6 +50,8 @@ def extract_metadata(type: str, filepath):
return json.loads(CoreMetadataDOC.construct(**extracted_metadata).json())
else:
extracted_metadata["associatedMedia"] = all_file_metadata
if type == "netcdf":
adapter = NetCDFAggregationMetadataAdapter()
catalog_record = json.loads(adapter.to_catalog_record(extracted_metadata).json())
return catalog_record

Expand Down

0 comments on commit e702fe6

Please sign in to comment.