diff --git a/packages/evo-objects/pyproject.toml b/packages/evo-objects/pyproject.toml index c482e2f7..2eb00819 100644 --- a/packages/evo-objects/pyproject.toml +++ b/packages/evo-objects/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "evo-objects" description = "Python SDK for using the Seequent Evo Geoscience Object API" -version = "0.4.1" +version = "0.4.2" requires-python = ">=3.10" license-files = ["LICENSE.md"] dynamic = ["readme"] diff --git a/packages/evo-objects/src/evo/objects/utils/data.py b/packages/evo-objects/src/evo/objects/utils/data.py index 268c8206..357baed0 100644 --- a/packages/evo-objects/src/evo/objects/utils/data.py +++ b/packages/evo-objects/src/evo/objects/utils/data.py @@ -67,6 +67,26 @@ def _iter_refs(target: Any, _key: str | None = None) -> Iterator[str]: yield str(value) +def _get_schema_from_dataframe(dataframe: "pd.DataFrame") -> pa.Schema: + """Get a normalized pyarrow schema from a pandas dataframe. + + This hook centralizes dataframe-to-Arrow schema adjustments before the + dataframe is converted to a table. It currently performs these conversions: + - large_string -> string + + :param dataframe: The pandas dataframe to get the schema from. + :return: A pyarrow schema with any configured type conversions applied. + """ + schema = pa.Schema.from_pandas(dataframe) + fields = [] + for field in schema: + if pa.types.is_large_string(field.type): + fields.append(field.with_type(pa.string())) + else: + fields.append(field) + return pa.schema(fields) + + class ObjectDataClient: """An optional wrapper around data upload and download functionality for geoscience objects. @@ -203,7 +223,7 @@ async def upload_category_table(self, table: pa.Table, fb: IFeedback = NoFeedbac columns_chunk_range = [] for column in table.itercolumns(): column = column.dictionary_encode() - if not (pa.types.is_string(column.type.value_type) or pa.types.is_large_string(column.type.value_type)): + if not pa.types.is_string(column.type.value_type): raise TableFormatError("Category columns must be of type string") if not pa.types.is_int32(column.type.index_type): # Currently, we only support int32 indices @@ -231,8 +251,6 @@ async def upload_category_table(self, table: pa.Table, fb: IFeedback = NoFeedbac ) dictionary_values = all_chunks[0].dictionary - if pa.types.is_large_string(dictionary_values.type): - dictionary_values = pc.cast(dictionary_values, pa.string()) lookup = pa.Table.from_arrays( [np.arange(len(dictionary_values), dtype=np.int32), dictionary_values], names=["key", "value"] ) @@ -300,7 +318,9 @@ def save_dataframe( no table formats are specified, raised when the table does not match any known format. :raises StorageFileNotFoundError: If the destination does not exist or is not a directory. """ - return self.save_table(pa.Table.from_pandas(dataframe), table_format=table_format) + schema = _get_schema_from_dataframe(dataframe) + table = pa.Table.from_pandas(dataframe, schema) + return self.save_table(table, table_format=table_format) async def upload_dataframe( self, @@ -322,7 +342,9 @@ async def upload_dataframe( :raises TableFormatError: If the provided table does not match any of the specified formats. If no table formats are specified, raised when the table does not match any known format. """ - table_info = await self.upload_table(pa.Table.from_pandas(dataframe), table_format=table_format, fb=fb) + schema = _get_schema_from_dataframe(dataframe) + table = pa.Table.from_pandas(dataframe, schema) + table_info = await self.upload_table(table, table_format=table_format, fb=fb) return table_info async def upload_category_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback = NoFeedback) -> CategoryInfo: @@ -342,7 +364,9 @@ async def upload_category_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback :raises TableFormatError: If the table isn't a valid category table, or if the number of categories exceeds what int32 type can represent. """ - category_info = await self.upload_category_table(pa.Table.from_pandas(dataframe), fb=fb) + schema = _get_schema_from_dataframe(dataframe) + table = pa.Table.from_pandas(dataframe, schema) + category_info = await self.upload_category_table(table, fb=fb) return category_info async def download_dataframe( diff --git a/packages/evo-objects/src/evo/objects/utils/tables.py b/packages/evo-objects/src/evo/objects/utils/tables.py index 87719ec7..baae199a 100644 --- a/packages/evo-objects/src/evo/objects/utils/tables.py +++ b/packages/evo-objects/src/evo/objects/utils/tables.py @@ -64,7 +64,7 @@ def _get_data_type(format_id: str) -> pa.DataType: return pa.int64() case "bool": return pa.bool_() - case "string" | "large_string": + case "string": return pa.string() case "timestamp": return pa.timestamp("us", tz="UTC") @@ -88,7 +88,7 @@ def _get_format_id(data_type: pa.DataType) -> str: return "int64" case "bool": return "bool" - case "string" | "large_string": + case "string": return "string" case "timestamp[us, tz=UTC]": return "timestamp" diff --git a/packages/evo-objects/tests/test_data_client.py b/packages/evo-objects/tests/test_data_client.py index 7c08e38e..deaf2f52 100644 --- a/packages/evo-objects/tests/test_data_client.py +++ b/packages/evo-objects/tests/test_data_client.py @@ -113,6 +113,7 @@ def test_save_dataframe(self, _name: str, pass_table_format: bool) -> None: mock.patch("evo.objects.utils.table_formats.get_known_format") as mock_get_known_format, mock.patch("evo.common.io.upload.StorageDestination") as mock_destination, mock.patch("pyarrow.Table") as mock_pyarrow_table, + mock.patch("evo.objects.utils.data._get_schema_from_dataframe", return_value=None), ): mock_pyarrow_table.from_pandas.return_value = mock_table = mock.Mock() mock_get_known_format.return_value = mock_known_format = mock.Mock(spec=KnownTableFormat) @@ -262,6 +263,7 @@ async def test_upload_dataframe(self, _name: str, pass_table_format: bool) -> No mock.patch("evo.objects.utils.table_formats.get_known_format") as mock_get_known_format, mock.patch("evo.common.io.upload.StorageDestination", autospec=True) as mock_destination, mock.patch("pyarrow.Table") as mock_pyarrow_table, + mock.patch("evo.objects.utils.data._get_schema_from_dataframe", return_value=None), ): mock_pyarrow_table.from_pandas.return_value = mock_table = mock.Mock() mock_get_known_format.return_value = mock_known_format = mock.Mock(spec=KnownTableFormat) @@ -350,6 +352,7 @@ async def test_upload_dataframe_exists(self) -> None: mock.patch("evo.objects.utils.table_formats.get_known_format") as mock_get_known_format, mock.patch("evo.common.io.upload.StorageDestination", autospec=True) as mock_destination, mock.patch("pyarrow.Table") as mock_pyarrow_table, + mock.patch("evo.objects.utils.data._get_schema_from_dataframe", return_value=None), ): mock_pyarrow_table.from_pandas.return_value = mock_table = mock.Mock() mock_get_known_format.return_value = mock_known_format = mock.Mock(spec=KnownTableFormat) @@ -404,15 +407,6 @@ async def _mock_upload_file_side_effect(*args, **kwargs): pa.table({"Category": pa.array([0, 1, 0, 2], type=pa.int32())}), table_formats.INTEGER_ARRAY_1_INT32, ), - ( - "large_string", - pa.table({"Category": pa.array(["A", "B", "A", "C"], type=pa.large_string())}), - pa.table( - {"key": pa.array([0, 1, 2], type=pa.int32()), "value": pa.array(["A", "B", "C"], type=pa.string())} - ), - pa.table({"Category": pa.array([0, 1, 0, 2], type=pa.int32())}), - table_formats.INTEGER_ARRAY_1_INT32, - ), ( "multiple_columns", pa.table( @@ -559,6 +553,16 @@ def side_effect(table, table_format=None, fb=None): self.assertEqual(category_info["values"]["table"], values_table) self.assertEqual(category_info["values"]["table_format"], values_table_format) + async def test_upload_dataframe_converts_large_string_via_schema(self) -> None: + """Test that upload_dataframe converts large_string columns (from pandas string[pyarrow]) to string via schema.""" + dataframe = pd.DataFrame({"col": pd.Series(["A", "B", "C"], dtype="string[pyarrow]")}) + with mock.patch.object(self.data_client, "upload_table", new_callable=mock.AsyncMock) as mock_upload: + mock_upload.return_value = {} + await self.data_client.upload_dataframe(dataframe) + + (table,), _ = mock_upload.await_args + self.assertTrue(pa.types.is_string(table.schema.field("col").type)) + async def test_download_table(self) -> None: """Test downloading tabular data using pyarrow.""" get_object_response = load_test_data("get_object.json") diff --git a/uv.lock b/uv.lock index ae34d85d..70341bb0 100644 --- a/uv.lock +++ b/uv.lock @@ -1073,7 +1073,7 @@ test = [ [[package]] name = "evo-objects" -version = "0.4.0" +version = "0.4.2" source = { editable = "packages/evo-objects" } dependencies = [ { name = "evo-sdk-common", extra = ["jmespath"] }, @@ -1226,7 +1226,7 @@ test = [ [[package]] name = "evo-sdk-common" -version = "0.5.20" +version = "0.5.21" source = { editable = "packages/evo-sdk-common" } dependencies = [ { name = "pure-interface" },