Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/evo-objects/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "evo-objects"
description = "Python SDK for using the Seequent Evo Geoscience Object API"
version = "0.4.1"
version = "0.4.2"
requires-python = ">=3.10"
license-files = ["LICENSE.md"]
dynamic = ["readme"]
Expand Down
36 changes: 30 additions & 6 deletions packages/evo-objects/src/evo/objects/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,26 @@ def _iter_refs(target: Any, _key: str | None = None) -> Iterator[str]:
yield str(value)


def _get_schema_from_dataframe(dataframe: "pd.DataFrame") -> pa.Schema:
    """Get a normalized pyarrow schema from a pandas dataframe.

    This hook centralizes dataframe-to-Arrow schema adjustments before the
    dataframe is converted to a table. It currently performs these conversions:
    - large_string -> string

    :param dataframe: The pandas dataframe to get the schema from.

    :return: A pyarrow schema with any configured type conversions applied.
    """
    schema = pa.Schema.from_pandas(dataframe)
    # Downcast large_string fields to plain string so downstream format
    # detection (which only recognizes "string") accepts the column; all
    # other fields pass through unchanged.
    fields = [
        field.with_type(pa.string()) if pa.types.is_large_string(field.type) else field
        for field in schema
    ]
    return pa.schema(fields)


class ObjectDataClient:
"""An optional wrapper around data upload and download functionality for geoscience objects.

Expand Down Expand Up @@ -203,7 +223,7 @@ async def upload_category_table(self, table: pa.Table, fb: IFeedback = NoFeedbac
columns_chunk_range = []
for column in table.itercolumns():
column = column.dictionary_encode()
if not (pa.types.is_string(column.type.value_type) or pa.types.is_large_string(column.type.value_type)):
if not pa.types.is_string(column.type.value_type):
raise TableFormatError("Category columns must be of type string")
if not pa.types.is_int32(column.type.index_type):
# Currently, we only support int32 indices
Expand Down Expand Up @@ -231,8 +251,6 @@ async def upload_category_table(self, table: pa.Table, fb: IFeedback = NoFeedbac
)

dictionary_values = all_chunks[0].dictionary
if pa.types.is_large_string(dictionary_values.type):
dictionary_values = pc.cast(dictionary_values, pa.string())
lookup = pa.Table.from_arrays(
[np.arange(len(dictionary_values), dtype=np.int32), dictionary_values], names=["key", "value"]
)
Expand Down Expand Up @@ -300,7 +318,9 @@ def save_dataframe(
no table formats are specified, raised when the table does not match any known format.
:raises StorageFileNotFoundError: If the destination does not exist or is not a directory.
"""
return self.save_table(pa.Table.from_pandas(dataframe), table_format=table_format)
schema = _get_schema_from_dataframe(dataframe)
table = pa.Table.from_pandas(dataframe, schema)
return self.save_table(table, table_format=table_format)

async def upload_dataframe(
self,
Expand All @@ -322,7 +342,9 @@ async def upload_dataframe(
:raises TableFormatError: If the provided table does not match any of the specified formats. If
no table formats are specified, raised when the table does not match any known format.
"""
table_info = await self.upload_table(pa.Table.from_pandas(dataframe), table_format=table_format, fb=fb)
schema = _get_schema_from_dataframe(dataframe)
table = pa.Table.from_pandas(dataframe, schema)
table_info = await self.upload_table(table, table_format=table_format, fb=fb)
return table_info

async def upload_category_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback = NoFeedback) -> CategoryInfo:
Expand All @@ -342,7 +364,9 @@ async def upload_category_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback
:raises TableFormatError: If the table isn't a valid category table, or if the number of categories exceeds
what int32 type can represent.
"""
category_info = await self.upload_category_table(pa.Table.from_pandas(dataframe), fb=fb)
schema = _get_schema_from_dataframe(dataframe)
table = pa.Table.from_pandas(dataframe, schema)
category_info = await self.upload_category_table(table, fb=fb)
return category_info

async def download_dataframe(
Expand Down
4 changes: 2 additions & 2 deletions packages/evo-objects/src/evo/objects/utils/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _get_data_type(format_id: str) -> pa.DataType:
return pa.int64()
case "bool":
return pa.bool_()
case "string" | "large_string":
case "string":
return pa.string()
case "timestamp":
return pa.timestamp("us", tz="UTC")
Expand All @@ -88,7 +88,7 @@ def _get_format_id(data_type: pa.DataType) -> str:
return "int64"
case "bool":
return "bool"
case "string" | "large_string":
case "string":
return "string"
case "timestamp[us, tz=UTC]":
return "timestamp"
Expand Down
22 changes: 13 additions & 9 deletions packages/evo-objects/tests/test_data_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def test_save_dataframe(self, _name: str, pass_table_format: bool) -> None:
mock.patch("evo.objects.utils.table_formats.get_known_format") as mock_get_known_format,
mock.patch("evo.common.io.upload.StorageDestination") as mock_destination,
mock.patch("pyarrow.Table") as mock_pyarrow_table,
mock.patch("evo.objects.utils.data._get_schema_from_dataframe", return_value=None),
):
mock_pyarrow_table.from_pandas.return_value = mock_table = mock.Mock()
mock_get_known_format.return_value = mock_known_format = mock.Mock(spec=KnownTableFormat)
Expand Down Expand Up @@ -262,6 +263,7 @@ async def test_upload_dataframe(self, _name: str, pass_table_format: bool) -> No
mock.patch("evo.objects.utils.table_formats.get_known_format") as mock_get_known_format,
mock.patch("evo.common.io.upload.StorageDestination", autospec=True) as mock_destination,
mock.patch("pyarrow.Table") as mock_pyarrow_table,
mock.patch("evo.objects.utils.data._get_schema_from_dataframe", return_value=None),
):
mock_pyarrow_table.from_pandas.return_value = mock_table = mock.Mock()
mock_get_known_format.return_value = mock_known_format = mock.Mock(spec=KnownTableFormat)
Expand Down Expand Up @@ -350,6 +352,7 @@ async def test_upload_dataframe_exists(self) -> None:
mock.patch("evo.objects.utils.table_formats.get_known_format") as mock_get_known_format,
mock.patch("evo.common.io.upload.StorageDestination", autospec=True) as mock_destination,
mock.patch("pyarrow.Table") as mock_pyarrow_table,
mock.patch("evo.objects.utils.data._get_schema_from_dataframe", return_value=None),
):
mock_pyarrow_table.from_pandas.return_value = mock_table = mock.Mock()
mock_get_known_format.return_value = mock_known_format = mock.Mock(spec=KnownTableFormat)
Expand Down Expand Up @@ -404,15 +407,6 @@ async def _mock_upload_file_side_effect(*args, **kwargs):
pa.table({"Category": pa.array([0, 1, 0, 2], type=pa.int32())}),
table_formats.INTEGER_ARRAY_1_INT32,
),
(
"large_string",
pa.table({"Category": pa.array(["A", "B", "A", "C"], type=pa.large_string())}),
pa.table(
{"key": pa.array([0, 1, 2], type=pa.int32()), "value": pa.array(["A", "B", "C"], type=pa.string())}
),
pa.table({"Category": pa.array([0, 1, 0, 2], type=pa.int32())}),
table_formats.INTEGER_ARRAY_1_INT32,
),
(
"multiple_columns",
pa.table(
Expand Down Expand Up @@ -559,6 +553,16 @@ def side_effect(table, table_format=None, fb=None):
self.assertEqual(category_info["values"]["table"], values_table)
self.assertEqual(category_info["values"]["table_format"], values_table_format)

async def test_upload_dataframe_converts_large_string_via_schema(self) -> None:
    """Verify upload_dataframe normalizes large_string columns (pandas string[pyarrow]) to string via the schema hook."""
    frame = pd.DataFrame({"col": pd.Series(["A", "B", "C"], dtype="string[pyarrow]")})
    with mock.patch.object(self.data_client, "upload_table", new_callable=mock.AsyncMock) as upload_mock:
        upload_mock.return_value = {}
        await self.data_client.upload_dataframe(frame)

    # The first positional argument of the awaited call is the converted table.
    (uploaded_table,), _ = upload_mock.await_args
    self.assertTrue(pa.types.is_string(uploaded_table.schema.field("col").type))

async def test_download_table(self) -> None:
"""Test downloading tabular data using pyarrow."""
get_object_response = load_test_data("get_object.json")
Expand Down
4 changes: 2 additions & 2 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading