DEP: Deprecate content as dict input, split in two arguments (#1016)

tnatt · web-flow · commit 6c66f57f283a · 2025-02-20T12:16:05.000+01:00
diff --git a/docs/src/dataio_3_migration.rst b/docs/src/dataio_3_migration.rst
@@ -20,7 +20,9 @@ but with replacements inplace.
    a dictionary form to provide a reference together with the ``vertical_domain`` is deprecated, use 
    the ``domain_reference`` argument instead.
  - ``workflow`` now only supports string input, example ``workflow='Structural modelling'``.
- - ``content`` was previously optional, it should now be explicitly provided.
+ - ``content`` was previously optional, it should now be explicitly provided as a valid content string.
+ - ``content`` no longer supports using a dictionary form to provide extra information together
+   with the ``content``, use the ``content_metadata`` argument instead.
  - ``content={'seismic': {'offset': '0-15'}}`` no longer works, use the key ``stacking_offset`` instead 
    of ``offset``.
 
@@ -32,6 +34,7 @@ Following are an example demonstrating several deprecated patterns:
     from fmu.dataio import ExportData
 
     ExportData(
+        content={'seismic': {'attribute': 'amplitude', 'calculation': 'mean'}}, # ⛔️ 
         fmu_context='preprocessed', # ⛔️ 
         access_ssdl={'access_level': 'asset', 'rep_include': True}, # ⛔️ 
         vertical_domain={'depth': 'msl'}, # ⛔️ 
@@ -45,7 +48,8 @@ Change to this instead 👇:
     from fmu.dataio import ExportData
 
     ExportData(
-        content='depth', # ✅ content must explicitly be provided
+        content='seismic', # ✅ content must explicitly be provided as a string
+        content_metadata={'attribute': 'amplitude', 'calculation': 'mean'}, # ✅
         preprocessed=True, # ✅
         classification='restricted', # ✅ note the use of 'restricted' instead of 'asset'
         rep_include=True, # ✅
diff --git a/src/fmu/dataio/dataio.py b/src/fmu/dataio/dataio.py
@@ -223,9 +223,14 @@ class ExportData:
             Note also: If missing or empty, export() may still be done, but without a
             metadata file (this feature may change in future releases).
 
-        content: Optional. Is a string or a dictionary with one key.
-            Example is "depth" or {"fluid_contact": {"xxx": "yyy", "zzz": "uuu"}}.
+        content: A required string describing the content of the data e.g. 'volumes'.
             Content is checked agains a white-list for validation!
+            Some contents like 'seismic' require additional information. This
+            should be provided through the 'content_metadata' argument.
+
+        content_metadata: Optional. Dictionary with additional information about the
+            provided content. Only required for some contents, e.g. 'seismic'.
+            Example {"attribute": "amplitude", "calculation": "mean"}.
 
         fmu_context: Optional string with value ``realization`` or ``case``. If not
             explicitly given it will be inferred based on the presence of ERT
@@ -361,6 +366,7 @@ class ExportData:
     classification: Optional[str] = None
     config: dict | GlobalConfiguration = field(default_factory=dict)
     content: Optional[Union[dict, str]] = None
+    content_metadata: Optional[dict] = None
     depth_reference: Optional[str] = None  # deprecated
     domain_reference: str = "msl"
     description: Union[str, list] = ""
@@ -521,6 +527,42 @@ def _get_rep_include(self) -> bool:
         logger.debug("Using default 'rep_include'=False")
         return False
 
+    def _get_content_enum(self) -> Optional[enums.Content]:
+        """Get the content enum."""
+        if self.content is None:
+            logger.debug("content not set from input, returning None'")
+            return None
+
+        if isinstance(self.content, str):
+            logger.debug("content is set from string input")
+            return enums.Content(self.content)
+
+        if isinstance(self.content, dict):
+            logger.debug("content is set from dict input")
+            return enums.Content(next(iter(self.content)))
+
+        raise ValueError(
+            "Incorrect format found for 'content'. It should be a valid "
+            f"content string: {[m.value for m in enums.Content]}"
+        )
+
+    def _get_content_metadata(self) -> Optional[dict]:
+        """
+        Get the content metadata if provided as input, else return None.
+        Validation takes place in the objectdata provider.
+        """
+        if self.content_metadata:
+            logger.debug("content_metadata is set from content_metadata argument")
+            return self.content_metadata
+
+        if isinstance(self.content, dict):
+            logger.debug("content_metadata is set from content argument")
+            content_enum = self._get_content_enum()
+            return self.content[content_enum]
+
+        logger.debug("Found no content_metadata, returning None")
+        return None
+
     def _show_deprecations_or_notimplemented(self) -> None:
         """Warn on deprecated keys or on stuff not implemented yet."""
 
@@ -538,6 +580,15 @@ def _show_deprecations_or_notimplemented(self) -> None:
                     "is not supported."
                 )
 
+        if isinstance(self.content, dict):
+            warn(
+                "Using the 'content' argument to set both the content and "
+                "and the content metadata will be deprecated. Set the 'content' "
+                "argument to a valid content string, and provide the extra "
+                "information through the 'content_metadata' argument instead.",
+                FutureWarning,
+            )
+
         if self.runpath:
             warn(
                 "The 'runpath' key has currently no function. It will be evaluated for "
diff --git a/src/fmu/dataio/providers/objectdata/_base.py b/src/fmu/dataio/providers/objectdata/_base.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 from abc import abstractmethod
-from copy import deepcopy
 from dataclasses import dataclass, field
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Final
@@ -18,16 +17,23 @@
 from fmu.dataio._utils import generate_description
 from fmu.dataio.exceptions import ConfigurationError
 from fmu.dataio.providers._base import Provider
-from fmu.dataio.providers.objectdata._export_models import AllowedContent, UnsetData
+from fmu.dataio.providers.objectdata._export_models import (
+    UnsetData,
+    content_metadata_factory,
+    content_requires_metadata,
+    property_warn,
+)
 
 if TYPE_CHECKING:
-    from fmu.dataio._model.data import (
+    from pydantic import BaseModel
+
+    from fmu.dataio._models.fmu_results.data import (
         BoundingBox2D,
         BoundingBox3D,
         Geometry,
     )
-    from fmu.dataio._models._fmu_results.enums import FMUClass, Layout
-    from fmu.dataio._models._fmu_results.specification import AnySpecification
+    from fmu.dataio._models.fmu_results.enums import FMUClass, Layout
+    from fmu.dataio._models.fmu_results.specification import AnySpecification
     from fmu.dataio.dataio import ExportData
     from fmu.dataio.types import Inferrable
 
@@ -65,7 +71,9 @@ def __post_init__(self) -> None:
                 raise ValueError("Can't use absolute path as 'forcefolder'")
             logger.info(f"Using forcefolder {self.dataio.forcefolder}")
 
-        content_model = self._get_validated_content(self.dataio.content)
+        content = self.dataio._get_content_enum() or "unset"
+        content_metadata = self._get_validated_content_metadata()
+
         strat_element = self._get_stratigraphy_element()
         self.name = strat_element.name
 
@@ -77,11 +85,9 @@ def __post_init__(self) -> None:
         metadata["top"] = strat_element.top
         metadata["base"] = strat_element.base
 
-        metadata["content"] = (usecontent := content_model.content)
-        if content_model.content_incl_specific:
-            metadata[usecontent] = getattr(
-                content_model.content_incl_specific, usecontent, None
-            )
+        metadata["content"] = content
+        if content_metadata:
+            metadata[content] = content_metadata
         metadata["product"] = self.product
         metadata["tagname"] = self.dataio.tagname
         metadata["format"] = self.fmt
@@ -154,30 +160,40 @@ def get_metadata(self) -> AnyData | UnsetData:
         assert self._metadata is not None
         return self._metadata
 
-    def _get_validated_content(self, content: str | dict | None) -> AllowedContent:
-        """Check content and return a validated model."""
-        logger.info("Evaluate content")
-        logger.debug("content is %s of type %s", str(content), type(content))
-
-        if not content:
-            return AllowedContent(content="unset")
-
-        if isinstance(content, str):
-            return AllowedContent(content=Content(content))
-
-        if len(content) > 1:
-            raise ValueError(
-                "Found more than one content item in the 'content' dictionary. Ensure "
-                "input is formatted as content={'mycontent': {extra_key: extra_value}}."
-            )
-        content = deepcopy(content)
-        usecontent, content_specific = next(iter(content.items()))
-        logger.debug("usecontent is %s", usecontent)
-        logger.debug("content_specific is %s", content_specific)
+    def _get_validated_content_metadata(self) -> BaseModel | None:
+        """
+        If content_metadata has been input, and the content requires metadata,
+        return the corresponding validated content_metadata model for the content,
+        else return None.
+        """
+        content = self.dataio._get_content_enum()
+        if content:
+            content_metadata = self.dataio._get_content_metadata()
+
+            if content_requires_metadata(content):
+                if not content_metadata:
+                    return self._raise_error_for_missing_content_metadata(content)
+
+                content_metadata_model = content_metadata_factory(content)
+                if not isinstance(content_metadata, dict):
+                    raise ValueError(
+                        "The 'content_metadata' needs to be input as a dictionary. "
+                        f"Possible keys for content '{content.value}' are: "
+                        f"{list(content_metadata_model.model_fields)}. "
+                    )
+                return content_metadata_model.model_validate(content_metadata)
+        return None
 
-        return AllowedContent.model_validate(
-            {"content": Content(usecontent), "content_incl_specific": content}
-        )
+    @staticmethod
+    def _raise_error_for_missing_content_metadata(content: Content) -> None:
+        """
+        Raise error for missing content_metadata with special handling of the
+        'property' content which is required to have content_metadata in the future.
+        """
+        if content == Content.property:
+            property_warn()
+        else:
+            raise ValueError(f"Content {content.value} requires additional input")
 
     def _get_stratigraphy_element(self) -> StratigraphyElement:
         """Derive the name and stratigraphy for the object; may have several sources.
diff --git a/src/fmu/dataio/providers/objectdata/_export_models.py b/src/fmu/dataio/providers/objectdata/_export_models.py
@@ -9,7 +9,7 @@
 
 import warnings
 from textwrap import dedent
-from typing import Final, Literal, Optional, Union
+from typing import Final, Literal, Optional, Type
 
 from pydantic import (
     BaseModel,
@@ -60,43 +60,6 @@ class AllowedContentProperty(BaseModel):
     is_discrete: Optional[bool] = Field(default=None)
 
 
-class ContentRequireSpecific(BaseModel):
-    field_outline: Optional[data.FieldOutline] = Field(default=None)
-    field_region: Optional[data.FieldRegion] = Field(default=None)
-    fluid_contact: Optional[data.FluidContact] = Field(default=None)
-    property: Optional[AllowedContentProperty] = Field(default=None)
-    seismic: Optional[AllowedContentSeismic] = Field(default=None)
-
-
-class AllowedContent(BaseModel):
-    content: Union[enums.Content, Literal["unset"]]
-    content_incl_specific: Optional[ContentRequireSpecific] = Field(default=None)
-
-    @model_validator(mode="before")
-    @classmethod
-    def _validate_input(cls, values: dict) -> dict:
-        content = values.get("content")
-        content_specific = values.get("content_incl_specific", {}).get(content)
-
-        if content in ContentRequireSpecific.model_fields and not content_specific:
-            # 'property' should be included below after a deprecation period
-            if content == enums.Content.property:
-                property_warn()
-            else:
-                raise ValueError(f"Content {content} requires additional input")
-
-        if content_specific and not isinstance(content_specific, dict):
-            raise ValueError(
-                "Content is incorrectly formatted. When giving content as a dict, "
-                "it must be formatted as: "
-                "{'mycontent': {extra_key: extra_value}} where mycontent is a string "
-                "and in the list of valid contents, and extra keys in associated "
-                "dictionary must be valid keys for this content."
-            )
-
-        return values
-
-
 class UnsetData(data.Data):
     content: Literal["unset"]  # type: ignore
 
@@ -111,3 +74,27 @@ def _deprecation_warning(self) -> UnsetData:
             FutureWarning,
         )
         return self
+
+
+def content_metadata_factory(content: enums.Content) -> Type[BaseModel]:
+    """Return the correct content_metadata model based on provided content."""
+    if content == enums.Content.field_outline:
+        return data.FieldOutline
+    if content == enums.Content.field_region:
+        return data.FieldRegion
+    if content == enums.Content.fluid_contact:
+        return data.FluidContact
+    if content == enums.Content.property:
+        return AllowedContentProperty
+    if content == enums.Content.seismic:
+        return AllowedContentSeismic
+    raise ValueError(f"No content_metadata model exist for content {str(content)}")
+
+
+def content_requires_metadata(content: enums.Content) -> bool:
+    """Flag if given content requires content_metadata"""
+    try:
+        content_metadata_factory(content)
+        return True
+    except ValueError:
+        return False
diff --git a/tests/test_units/test_contents.py b/tests/test_units/test_contents.py
@@ -3,7 +3,9 @@
 import pytest
 from pydantic import ValidationError
 
+from fmu.dataio._models.fmu_results import enums
 from fmu.dataio.dataio import ExportData
+from fmu.dataio.providers.objectdata._export_models import content_requires_metadata
 
 # generic testing of functionality related to content is done elsewhere,
 # mainly in test_dataio.py.
@@ -83,7 +85,7 @@ def test_content_fluid_contact_case_insensitive(regsurf, globalconfig2):
 
 def test_content_fluid_contact_raises_on_invalid_contact(regsurf, globalconfig2):
     """Test export of the fluid_contact content."""
-    with pytest.raises(ValidationError, match="fluid_contact"):
+    with pytest.raises(ValidationError, match="FluidContact"):
         ExportData(
             config=globalconfig2,
             name="MyName",
@@ -333,3 +335,19 @@ def test_content_wellpicks(dataframe, globalconfig2):
     ).generate_metadata(dataframe)
 
     assert meta["data"]["content"] == "wellpicks"
+
+
+def test_content_requires_metadata():
+    """Test the content_requires_metadata function"""
+
+    # test contents that require extra
+    assert content_requires_metadata(enums.Content.field_outline)
+    assert content_requires_metadata(enums.Content.field_region)
+    assert content_requires_metadata(enums.Content.fluid_contact)
+    assert content_requires_metadata(enums.Content.property)
+    assert content_requires_metadata(enums.Content.seismic)
+
+    # test some random contents that do not require extra
+    assert not content_requires_metadata(enums.Content.depth)
+    assert not content_requires_metadata(enums.Content.timeseries)
+    assert not content_requires_metadata(enums.Content.volumes)
diff --git a/tests/test_units/test_dataio.py b/tests/test_units/test_dataio.py
@@ -470,7 +470,7 @@ def test_content_invalid_dict(globalconfig1, regsurf):
     eobj = ExportData(
         config=globalconfig1, content={"seismic": "some_key", "extra": "some_value"}
     )
-    with pytest.raises(ValueError, match="Found more than one content item"):
+    with pytest.raises(ValueError):
         eobj.generate_metadata(regsurf)
 
 
@@ -483,7 +483,7 @@ def test_content_valid_string(regsurf, globalconfig2):
 
 def test_seismic_content_require_seismic_data(globalconfig2, regsurf):
     eobj = ExportData(config=globalconfig2, content="seismic")
-    with pytest.raises(pydantic.ValidationError, match="requires additional input"):
+    with pytest.raises(ValueError, match="requires additional input"):
         eobj.generate_metadata(regsurf)
 
 
@@ -522,7 +522,7 @@ def test_content_is_a_wrongly_formatted_dict(globalconfig2, regsurf):
         name="TopVolantis",
         content={"seismic": "myvalue"},
     )
-    with pytest.raises(ValueError, match="incorrectly formatted"):
+    with pytest.raises(ValueError):
         eobj.generate_metadata(regsurf)