Skip to content

Commit

Permalink
[#8] Saving NetCDF-specific metadata to a JSON file - initial work
Browse files Browse the repository at this point in the history
  • Loading branch information
pkdash committed Mar 13, 2024
1 parent 8292dcd commit e702fe6
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 7 deletions.
76 changes: 75 additions & 1 deletion hsextract/adapters/hydroshare.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import requests
from datetime import datetime
from typing import Any, List, Optional, Union
from pydantic import BaseModel, EmailStr, HttpUrl
Expand Down Expand Up @@ -258,3 +257,78 @@ def to_catalog_dataset(self):
dataset.license = self.to_dataset_license()
dataset.citation = [self.citation]
return dataset


class Variable(BaseModel):
    # Extracted NetCDF variable metadata (as produced by the netcdf extractor).
    name: str
    descriptive_name: Optional[str]
    unit: str
    type: str
    shape: str
    method: Optional[str]

    def to_aggregation_variable(self):
        """Map this extracted variable to a schema.PropertyValue for the catalog record."""
        # The shape of the variable has no direct slot on PropertyValue, so it is
        # carried as a nested PropertyValue stored in the outer object's value field.
        shape_property = schema.PropertyValue.construct()
        shape_property.name = "shape"
        shape_property.unitCode = self.type
        shape_property.value = self.shape

        variable_property = schema.PropertyValue.construct()
        variable_property.name = self.name
        variable_property.unitCode = self.unit
        variable_property.description = self.descriptive_name
        variable_property.measurementTechnique = self.method
        variable_property.value = shape_property
        return variable_property


class NetCDFAggregationMetadataAdapter:
    """Adapter turning extracted NetCDF aggregation metadata into a catalog record."""

    @staticmethod
    def to_catalog_record(aggr_metadata: dict):
        """Validate the extracted metadata dict and convert it to a catalog dataset record."""
        return _NetCDFAggregationMetadata(**aggr_metadata).to_catalog_dataset()


class _NetCDFAggregationMetadata(BaseModel):
    # Internal model validating the raw metadata dict extracted from a NetCDF file.
    title: str
    abstract: str
    subjects: Optional[List[str]]
    variables: List[Variable]
    spatial_coverage: Optional[Union[SpatialCoverageBox, SpatialCoveragePoint]]
    period_coverage: Optional[TemporalCoverage]
    content_files: Optional[List[ContentFile]]
    # the extracted file (media object) metadata is already in schema.MediaObject format
    associatedMedia: Optional[List[schema.MediaObject]]

    def to_aggregation_associated_media(self):
        """Return a dataset media object for each extracted content file."""
        return [content_file.to_dataset_media_object() for content_file in self.content_files]

    def to_aggregation_spatial_coverage(self):
        """Convert the spatial coverage if present, otherwise None."""
        return self.spatial_coverage.to_dataset_spatial_coverage() if self.spatial_coverage else None

    def to_aggregation_period_coverage(self):
        """Convert the temporal coverage if present, otherwise None."""
        return self.period_coverage.to_dataset_temporal_coverage() if self.period_coverage else None

    def to_aggregation_keywords(self):
        """Return the subject keywords, or None when there are none."""
        return self.subjects or None

    def to_catalog_dataset(self):
        """Assemble the schema.NetCDFAggregationMetadata catalog record."""
        record = schema.NetCDFAggregationMetadata.construct()
        record.name = self.title
        record.description = self.abstract
        record.keywords = self.to_aggregation_keywords()
        record.spatialCoverage = self.to_aggregation_spatial_coverage()
        record.temporalCoverage = self.to_aggregation_period_coverage()
        record.variableMeasured = [variable.to_aggregation_variable() for variable in self.variables]
        record.additionalProperty = []
        record.associatedMedia = self.associatedMedia
        return record
10 changes: 9 additions & 1 deletion hsextract/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,12 @@ def file_metadata(path: str):
checksum = d.hexdigest()
size = f"{os.path.getsize(path)/1000.00} KB"
mime_type = mimetypes.guess_type(path)[0]
return {"contentUrl": path, "contentSize": size, "sha256": checksum, "encodingFormat": mime_type}, None
file_name = os.path.basename(path)
file_meta = {
"contentUrl": path,
"contentSize": size,
"sha256": checksum,
"encodingFormat": mime_type,
"name": file_name
}
return file_meta, None
120 changes: 117 additions & 3 deletions hsextract/models/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ class MediaObject(SchemaBaseModel):
"unit of measurement.",
)
name: str = Field(description="The name of the media object (file).")
checksum: str = Field(description="The MD5 checksum of the file")
sha256: Optional[str] = Field(title="SHA-256", description="The SHA-256 hash of the media object.")

@validator('contentSize')
def validate_content_size(cls, v):
Expand Down Expand Up @@ -363,6 +363,53 @@ def validate_content_size(cls, v):
return v


class PropertyValueBase(SchemaBaseModel):
    # Schema.org PropertyValue: a named value with optional unit, bounds,
    # description, and measurement technique.
    type: str = Field(
        alias="@type",
        default="PropertyValue",
        # pydantic v1's `const` parameter is a bool (True pins the field to its
        # default value). The original passed the string "PropertyValue", which
        # only worked by truthiness; True matches the const=True usage on the
        # other models in this module.
        const=True,
        description="A property-value pair.",
    )
    propertyID: Optional[str] = Field(
        title="Property ID", description="The ID of the property."
    )
    name: str = Field(description="The name of the property.")
    value: str = Field(description="The value of the property.")
    unitCode: Optional[str] = Field(
        title="Measurement unit", description="The unit of measurement for the value."
    )
    description: Optional[str] = Field(description="A description of the property.")
    minValue: Optional[float] = Field(
        title="Minimum value", description="The minimum allowed value for the property."
    )
    maxValue: Optional[float] = Field(
        title="Maximum value", description="The maximum allowed value for the property."
    )
    measurementTechnique: Optional[str] = Field(
        title="Measurement technique", description="A technique or technology used in a measurement."
    )

    class Config:
        title = "PropertyValue"

    @root_validator
    def validate_min_max_values(cls, values):
        """Reject a (minValue, maxValue) pair where the lower bound exceeds the upper."""
        min_value = values.get("minValue", None)
        max_value = values.get("maxValue", None)
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("Minimum value must be less than or equal to maximum value")
        return values


class PropertyValue(PropertyValueBase):
    # using PropertyValueBase model instead of PropertyValue model as one of the types for the value field
    # in order for the schema generation (schema.json) to work. Self referencing nested models leads to
    # infinite loop in our custom schema generation code when trying to replace dict with key '$ref'
    # NOTE(review): this caps nesting at one level of structured values; deeper
    # nesting would require fixing the $ref handling in the schema generator.
    value: Union[str, PropertyValueBase, List[PropertyValueBase]] = Field(description="The value of the property.")


class CoreMetadata(SchemaBaseModel):
context: HttpUrl = Field(
alias='@context',
Expand Down Expand Up @@ -468,9 +515,20 @@ class CoreMetadata(SchemaBaseModel):
citation: Optional[List[str]] = Field(title="Citation", description="A bibliographic citation for the resource.")


class DatasetSchema(CoreMetadata):
class DatasetMetadata(CoreMetadata):
# used only for generating the JSON-LD schema for a dataset.
pass
variableMeasured: Optional[List[Union[str, PropertyValue]]] = Field(
title="Variables measured", description="Measured variables."
)
additionalProperty: Optional[List[PropertyValue]] = Field(
title="Additional properties",
default=[],
description="Additional properties of the Dataset."
)
sourceOrganization: Optional[Organization] = Field(
title="Source organization",
description="The organization that provided the data for this dataset."
)


class CoreMetadataDOC(CoreMetadata):
Expand All @@ -485,3 +543,59 @@ class Settings:
year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, minute=dt.minute, second=dt.second
),
}


# Document model for a dataset record: combines DatasetMetadata's fields with
# CoreMetadataDOC's document behavior; intentionally adds nothing of its own.
class DatasetMetadataDOC(CoreMetadataDOC, DatasetMetadata):
    pass


class BaseAggregationMetadata(BaseModel):
    """Base class for aggregation metadata - used for metadata view in UI."""
    # const=True pins the @type field to its default; subclasses override the
    # default to label their specific aggregation type.
    type: str = Field(
        alias="@type",
        default="Aggregation",
        const=True,
        description="Type of aggregation."
    )
    name: Optional[str] = Field(
        description="A text string with a descriptive name or title for the aggregation."
    )
    description: Optional[str] = Field(
        description="A text string containing a description/abstract for the aggregation."
    )
    # required field, but min_items=0 allows an empty keyword list
    keywords: List[str] = Field(
        min_items=0,
        description="Keywords or tags used to describe the dataset, delimited by commas."
    )
    associatedMedia: Optional[List[MediaObject]] = Field(
        title="Aggregation content",
        description="A media object that encodes this aggregation."
    )
    spatialCoverage: Optional[Place] = Field(
        description="The spatialCoverage of a CreativeWork indicates the place(s) which are the focus of the content. "
                    "It is a sub property of contentLocation intended primarily for more technical and "
                    "detailed materials. For example with a Dataset, it indicates areas that the dataset "
                    "describes: a dataset of New York weather would have spatialCoverage which was the "
                    "place: the state of New York."
    )
    temporalCoverage: Optional[TemporalCoverage] = Field(
        title="Temporal coverage",
        description="The time period that applies to all of the content within the aggregation."
    )
    # NOTE(review): mutable default on a pydantic Field — pydantic v1 deep-copies
    # defaults per instance, so this is safe here (unlike a plain function default).
    additionalProperty: Optional[List[PropertyValue]] = Field(
        title="Additional properties",
        default=[],
        description="Additional properties of the aggregation."
    )


class NetCDFAggregationMetadata(BaseAggregationMetadata):
    # Aggregation metadata for a NetCDF (multidimensional) dataset: relabels the
    # @type and adds the measured-variables list on top of the base fields.
    type: str = Field(
        alias="@type",
        default="Multidimensional Dataset",
        const=True,
        description="Type of aggregation."
    )
    variableMeasured: Optional[List[Union[str, PropertyValue]]] = Field(
        title="Variables measured", description="Measured variables."
    )
3 changes: 2 additions & 1 deletion hsextract/netcdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,13 @@ def get_nc_meta_dict(nc_file_name):
else:
nc_dataset = get_nc_dataset(nc_file_name)

if nc_dataset is None:
raise ValueError(f"The file:{nc_file_name} is not valid netcdf file.")
res_dublin_core_meta = get_dublin_core_meta(nc_dataset)
res_type_specific_meta = get_type_specific_meta(nc_dataset)
nc_dataset.close()

md = combine_metadata(res_dublin_core_meta, res_type_specific_meta)

md["content_files"] = [nc_file_name]
return md

Expand Down
4 changes: 3 additions & 1 deletion hsextract/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from pathlib import Path

from hsextract.adapters.hydroshare import HydroshareMetadataAdapter
from hsextract.adapters.hydroshare import HydroshareMetadataAdapter, NetCDFAggregationMetadataAdapter
from hsextract.listing.utils import prepare_files
from hsextract.models.schema import CoreMetadataDOC
from hsextract.raster.utils import extract_from_tif_file
Expand Down Expand Up @@ -50,6 +50,8 @@ def extract_metadata(type: str, filepath):
return json.loads(CoreMetadataDOC.construct(**extracted_metadata).json())
else:
extracted_metadata["associatedMedia"] = all_file_metadata
if type == "netcdf":
adapter = NetCDFAggregationMetadataAdapter()
catalog_record = json.loads(adapter.to_catalog_record(extracted_metadata).json())
return catalog_record

Expand Down

0 comments on commit e702fe6

Please sign in to comment.