Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/evo-objects/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "evo-objects"
description = "Python SDK for using the Seequent Evo Geoscience Object API"
version = "0.4.1"
version = "0.4.2"
requires-python = ">=3.10"
license-files = ["LICENSE.md"]
dynamic = ["readme"]
Expand Down
36 changes: 30 additions & 6 deletions packages/evo-objects/src/evo/objects/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,26 @@ def _iter_refs(target: Any, _key: str | None = None) -> Iterator[str]:
yield str(value)


def _get_schema_from_dataframe(dataframe: "pd.DataFrame") -> pa.Schema:
    """Get a normalized pyarrow schema from a pandas dataframe.

    This hook centralizes dataframe-to-Arrow schema adjustments before the
    dataframe is converted to a table. It currently performs these conversions:
    - large_string -> string

    :param dataframe: The pandas dataframe to get the schema from.

    :return: A pyarrow schema with any configured type conversions applied.
    """
    schema = pa.Schema.from_pandas(dataframe)
    # Downcast large_string fields to plain string so downstream format
    # detection (which only recognizes "string") accepts the column; all
    # other fields pass through unchanged.
    fields = [
        field.with_type(pa.string()) if pa.types.is_large_string(field.type) else field
        for field in schema
    ]
    return pa.schema(fields)


class ObjectDataClient:
"""An optional wrapper around data upload and download functionality for geoscience objects.

Expand Down Expand Up @@ -203,7 +223,7 @@ async def upload_category_table(self, table: pa.Table, fb: IFeedback = NoFeedbac
columns_chunk_range = []
for column in table.itercolumns():
column = column.dictionary_encode()
if not (pa.types.is_string(column.type.value_type) or pa.types.is_large_string(column.type.value_type)):
if not pa.types.is_string(column.type.value_type):
raise TableFormatError("Category columns must be of type string")
if not pa.types.is_int32(column.type.index_type):
# Currently, we only support int32 indices
Expand Down Expand Up @@ -231,8 +251,6 @@ async def upload_category_table(self, table: pa.Table, fb: IFeedback = NoFeedbac
)

dictionary_values = all_chunks[0].dictionary
if pa.types.is_large_string(dictionary_values.type):
dictionary_values = pc.cast(dictionary_values, pa.string())
lookup = pa.Table.from_arrays(
[np.arange(len(dictionary_values), dtype=np.int32), dictionary_values], names=["key", "value"]
)
Expand Down Expand Up @@ -300,7 +318,9 @@ def save_dataframe(
no table formats are specified, raised when the table does not match any known format.
:raises StorageFileNotFoundError: If the destination does not exist or is not a directory.
"""
return self.save_table(pa.Table.from_pandas(dataframe), table_format=table_format)
schema = _get_schema_from_dataframe(dataframe)
table = pa.Table.from_pandas(dataframe, schema)
return self.save_table(table, table_format=table_format)

async def upload_dataframe(
self,
Expand All @@ -322,7 +342,9 @@ async def upload_dataframe(
:raises TableFormatError: If the provided table does not match any of the specified formats. If
no table formats are specified, raised when the table does not match any known format.
"""
table_info = await self.upload_table(pa.Table.from_pandas(dataframe), table_format=table_format, fb=fb)
schema = _get_schema_from_dataframe(dataframe)
table = pa.Table.from_pandas(dataframe, schema)
table_info = await self.upload_table(table, table_format=table_format, fb=fb)
return table_info

async def upload_category_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback = NoFeedback) -> CategoryInfo:
Expand All @@ -342,7 +364,9 @@ async def upload_category_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback
:raises TableFormatError: If the table isn't a valid category table, or if the number of categories exceeds
what int32 type can represent.
"""
category_info = await self.upload_category_table(pa.Table.from_pandas(dataframe), fb=fb)
schema = _get_schema_from_dataframe(dataframe)
table = pa.Table.from_pandas(dataframe, schema)
category_info = await self.upload_category_table(table, fb=fb)
return category_info

async def download_dataframe(
Expand Down
4 changes: 2 additions & 2 deletions packages/evo-objects/src/evo/objects/utils/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _get_data_type(format_id: str) -> pa.DataType:
return pa.int64()
case "bool":
return pa.bool_()
case "string" | "large_string":
case "string":
return pa.string()
case "timestamp":
return pa.timestamp("us", tz="UTC")
Expand All @@ -88,7 +88,7 @@ def _get_format_id(data_type: pa.DataType) -> str:
return "int64"
case "bool":
return "bool"
case "string" | "large_string":
case "string":
return "string"
case "timestamp[us, tz=UTC]":
return "timestamp"
Expand Down
22 changes: 13 additions & 9 deletions packages/evo-objects/tests/test_data_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def test_save_dataframe(self, _name: str, pass_table_format: bool) -> None:
mock.patch("evo.objects.utils.table_formats.get_known_format") as mock_get_known_format,
mock.patch("evo.common.io.upload.StorageDestination") as mock_destination,
mock.patch("pyarrow.Table") as mock_pyarrow_table,
mock.patch("evo.objects.utils.data._get_schema_from_dataframe", return_value=None),
):
mock_pyarrow_table.from_pandas.return_value = mock_table = mock.Mock()
mock_get_known_format.return_value = mock_known_format = mock.Mock(spec=KnownTableFormat)
Expand Down Expand Up @@ -262,6 +263,7 @@ async def test_upload_dataframe(self, _name: str, pass_table_format: bool) -> No
mock.patch("evo.objects.utils.table_formats.get_known_format") as mock_get_known_format,
mock.patch("evo.common.io.upload.StorageDestination", autospec=True) as mock_destination,
mock.patch("pyarrow.Table") as mock_pyarrow_table,
mock.patch("evo.objects.utils.data._get_schema_from_dataframe", return_value=None),
):
mock_pyarrow_table.from_pandas.return_value = mock_table = mock.Mock()
mock_get_known_format.return_value = mock_known_format = mock.Mock(spec=KnownTableFormat)
Expand Down Expand Up @@ -350,6 +352,7 @@ async def test_upload_dataframe_exists(self) -> None:
mock.patch("evo.objects.utils.table_formats.get_known_format") as mock_get_known_format,
mock.patch("evo.common.io.upload.StorageDestination", autospec=True) as mock_destination,
mock.patch("pyarrow.Table") as mock_pyarrow_table,
mock.patch("evo.objects.utils.data._get_schema_from_dataframe", return_value=None),
):
mock_pyarrow_table.from_pandas.return_value = mock_table = mock.Mock()
mock_get_known_format.return_value = mock_known_format = mock.Mock(spec=KnownTableFormat)
Expand Down Expand Up @@ -404,15 +407,6 @@ async def _mock_upload_file_side_effect(*args, **kwargs):
pa.table({"Category": pa.array([0, 1, 0, 2], type=pa.int32())}),
table_formats.INTEGER_ARRAY_1_INT32,
),
(
"large_string",
pa.table({"Category": pa.array(["A", "B", "A", "C"], type=pa.large_string())}),
pa.table(
{"key": pa.array([0, 1, 2], type=pa.int32()), "value": pa.array(["A", "B", "C"], type=pa.string())}
),
pa.table({"Category": pa.array([0, 1, 0, 2], type=pa.int32())}),
table_formats.INTEGER_ARRAY_1_INT32,
),
(
"multiple_columns",
pa.table(
Expand Down Expand Up @@ -559,6 +553,16 @@ def side_effect(table, table_format=None, fb=None):
self.assertEqual(category_info["values"]["table"], values_table)
self.assertEqual(category_info["values"]["table_format"], values_table_format)

async def test_upload_dataframe_converts_large_string_via_schema(self) -> None:
    """Verify upload_dataframe normalizes large_string columns (pandas string[pyarrow]) to string via the schema hook."""
    frame = pd.DataFrame({"col": pd.Series(["A", "B", "C"], dtype="string[pyarrow]")})
    with mock.patch.object(self.data_client, "upload_table", new_callable=mock.AsyncMock) as upload_mock:
        upload_mock.return_value = {}
        await self.data_client.upload_dataframe(frame)

    # The first positional argument of the awaited call is the converted table.
    (uploaded_table,), _ = upload_mock.await_args
    self.assertTrue(pa.types.is_string(uploaded_table.schema.field("col").type))

async def test_download_table(self) -> None:
"""Test downloading tabular data using pyarrow."""
get_object_response = load_test_data("get_object.json")
Expand Down
4 changes: 2 additions & 2 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading