Skip to content

Commit

Permalink
fix: fix an issue where pagination on list files did not work properly
Browse files Browse the repository at this point in the history
  • Loading branch information
nicornk committed Dec 5, 2024
1 parent 42cccdc commit a139d88
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 6 deletions.
19 changes: 13 additions & 6 deletions libs/foundry-dev-tools/src/foundry_dev_tools/clients/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def list_dataset_files(
page_size: int = 1000,
logical_path: api_types.PathInDataset | None = None,
page_start_logical_path: api_types.PathInDataset | None = None,
start_transaction_rid: api_types.TransactionRid | None = None,
include_open_exclusive_transaction: bool = False,
exclude_hidden_files: bool = False,
temporary_credentials_auth_token: str | None = None,
Expand All @@ -43,6 +44,8 @@ def list_dataset_files(
(a slash is added to the end of logicalPath if necessary and a prefix-match is performed)
page_start_logical_path: if specified page starts at the given path,
otherwise at the beginning of the file list
start_transaction_rid: if a startTransactionRid is given, the view starting at the startTransactionRid
and ending at the endRef is returned
include_open_exclusive_transaction: if files added in open transaction should be returned
as well in the response
exclude_hidden_files: if hidden files should be excluded (e.g. _log files)
Expand All @@ -62,7 +65,7 @@ def list_dataset_files(
]
"""

def _inner_get(next_page_token: str | None = None) -> dict:
def _inner_get(page_start_logical_path: str | None = None) -> dict:
return self.api_get_dataset_view_files3(
dataset_rid=dataset_rid,
end_ref=end_ref,
Expand All @@ -71,14 +74,17 @@ def _inner_get(next_page_token: str | None = None) -> dict:
page_start_logical_path=page_start_logical_path,
include_open_exclusive_transaction=include_open_exclusive_transaction,
exclude_hidden_files=exclude_hidden_files,
start_transaction_rid=next_page_token,
start_transaction_rid=start_transaction_rid,
temporary_credentials_auth_token=temporary_credentials_auth_token,
).json()

result: list[dict] = []
batch_result = {"nextPageToken": ""}
while batch_result["nextPageToken"] is not None:
batch_result = _inner_get(next_page_token=batch_result["nextPageToken"])
first_result = _inner_get(page_start_logical_path=page_start_logical_path)
result.extend(first_result["values"])
next_page_token = first_result.get("nextPageToken", None)
while next_page_token is not None:
batch_result = _inner_get(page_start_logical_path=next_page_token)
next_page_token = batch_result.get("nextPageToken", None)
result.extend(batch_result["values"]) # type: ignore[arg-type]
return result

Expand Down Expand Up @@ -110,7 +116,8 @@ def api_get_dataset_view_files3(
include_open_exclusive_transaction: if files added in open transaction should be returned
as well in the response
exclude_hidden_files: if hidden files should be excluded (e.g. _log files)
start_transaction_rid: start transaction rid
start_transaction_rid: if a startTransactionRid is given, the view starting at the startTransactionRid
and ending at the endRef is returned
temporary_credentials_auth_token: to generate temporary credentials for presigned URLs
**kwargs: gets passed to :py:meth:`APIClient.api_request`
Expand Down
4 changes: 4 additions & 0 deletions tests/integration/test_general_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ def test_monster_integration_test(): # noqa: PLR0915

TEST_SINGLETON.v1_client.commit_transaction(ds["rid"], transaction_rid)

files = TEST_SINGLETON.ctx.catalog.list_dataset_files(dataset_rid=ds["rid"], end_ref=branch, page_size=1)

assert len(files) == 3

schema = {
"fieldSchemaList": [
{"type": "INTEGER", "name": "col1", "customMetadata": {}},
Expand Down
125 changes: 125 additions & 0 deletions tests/unit/clients/test_catalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
from foundry_dev_tools.utils.clients import build_api_url
from tests.unit.mocks import TEST_HOST


def test_list_dataset_files(test_context_mock):
test_context_mock.mock_adapter.register_uri(
"PUT",
build_api_url(TEST_HOST.url, "foundry-catalog", "catalog/datasets/rid/views/master/files3"),
response_list=[
{
"json": {
"values": [
{
"logicalPath": "dummy_file_0.parquet",
"physicalPath": "1234",
"physicalUri": "https://s3.eu-central-1.amazonaws.com/",
"transactionRid": "ri.foundry.main.transaction.000002c3-6680-ad68-8d6b-500ef09cbd46",
"fileMetadata": {"length": 523},
"isOpen": False,
"timeModified": "2024-12-05T14:36:18.413Z",
}
],
"nextPageToken": "dummy_file_1.parquet",
}
},
{
"json": {
"values": [
{
"logicalPath": "dummy_file_1.parquet",
"physicalPath": "2345",
"physicalUri": "https://s3.eu-central-1.amazonaws.com/",
"transactionRid": "ri.foundry.main.transaction.000002c3-6680-ad68-8d6b-500ef09cbd46",
"fileMetadata": {"length": 523},
"isOpen": False,
"timeModified": "2024-12-05T14:36:18.413Z",
}
],
"nextPageToken": "dummy_file_2.parquet",
}
},
{
"json": {
"values": [
{
"logicalPath": "dummy_file_2.parquet",
"physicalPath": "234234",
"physicalUri": "https://s3.eu-central-1.amazonaws.com/",
"transactionRid": "ri.foundry.main.transaction.000002c3-6680-ad68-8d6b-500ef09cbd46",
"fileMetadata": {"length": 523},
"isOpen": False,
"timeModified": "2024-12-05T14:36:18.413Z",
}
],
"nextPageToken": None,
}
},
],
)

files = test_context_mock.catalog.list_dataset_files(dataset_rid="rid", page_size=1)

assert len(files) == 3
assert test_context_mock.mock_adapter.call_count == 3


def test_list_dataset_files_nextpagetoken_not_present(test_context_mock):
test_context_mock.mock_adapter.register_uri(
"PUT",
build_api_url(TEST_HOST.url, "foundry-catalog", "catalog/datasets/rid/views/master/files3"),
response_list=[
{
"json": {
"values": [
{
"logicalPath": "dummy_file_0.parquet",
"physicalPath": "1234",
"physicalUri": "https://s3.eu-central-1.amazonaws.com/",
"transactionRid": "ri.foundry.main.transaction.000002c3-6680-ad68-8d6b-500ef09cbd46",
"fileMetadata": {"length": 523},
"isOpen": False,
"timeModified": "2024-12-05T14:36:18.413Z",
}
],
"nextPageToken": "dummy_file_1.parquet",
}
},
{
"json": {
"values": [
{
"logicalPath": "dummy_file_1.parquet",
"physicalPath": "2345",
"physicalUri": "https://s3.eu-central-1.amazonaws.com/",
"transactionRid": "ri.foundry.main.transaction.000002c3-6680-ad68-8d6b-500ef09cbd46",
"fileMetadata": {"length": 523},
"isOpen": False,
"timeModified": "2024-12-05T14:36:18.413Z",
}
],
"nextPageToken": "dummy_file_2.parquet",
}
},
{
"json": {
"values": [
{
"logicalPath": "dummy_file_2.parquet",
"physicalPath": "234234",
"physicalUri": "https://s3.eu-central-1.amazonaws.com/",
"transactionRid": "ri.foundry.main.transaction.000002c3-6680-ad68-8d6b-500ef09cbd46",
"fileMetadata": {"length": 523},
"isOpen": False,
"timeModified": "2024-12-05T14:36:18.413Z",
}
]
}
},
],
)

files = test_context_mock.catalog.list_dataset_files(dataset_rid="rid", page_size=1)

assert len(files) == 3
assert test_context_mock.mock_adapter.call_count == 3

0 comments on commit a139d88

Please sign in to comment.