Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Serialize query strings to avoid Ray Dataset column imputation #1171

Merged
merged 2 commits into from
Feb 11, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions lib/sycamore/sycamore/connectors/opensearch/opensearch_reader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
from copy import deepcopy

Expand Down Expand Up @@ -288,7 +289,7 @@ def __init__(
logger.info(f"OpenSearchReader using PIT: {self.use_pit}")

@timetrace("OpenSearchReader")
def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
def _to_parent_doc(self, doc: dict[str, Any]) -> List[dict[str, Any]]:
"""
Get all parent documents from a given slice.
"""
Expand All @@ -304,6 +305,7 @@ def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
raise ValueError("Target is not present\n" f"Parameters: {self._query_params}\n")

os_client = client._client
slice_query = json.loads(doc["doc"])

assert (
get_doc_count_for_slice(os_client, slice_query) < 10000
Expand Down Expand Up @@ -341,7 +343,7 @@ def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
logger.info(f"Read {len(results)} documents from {self._query_params.index_name}")

except Exception as e:
raise ValueError(f"Error reading from target: {e}")
raise ValueError(f"Error reading from target: {e}, query: {slice_query}")
finally:
if client is not None:
client.close()
Expand All @@ -350,7 +352,7 @@ def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
return ret

@timetrace("OpenSearchReader")
def _to_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
def _to_doc(self, doc: dict[str, Any]) -> List[dict[str, Any]]:
"""
Get all documents from a given slice.
"""
Expand All @@ -368,6 +370,7 @@ def _to_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
raise ValueError("Target is not present\n" f"Parameters: {self._query_params}\n")

os_client = client._client
slice_query = json.loads(doc["doc"])
slice_count = get_doc_count_for_slice(os_client, slice_query)
assert slice_count <= 10000, f"Slice count ({slice_count}) should return <= 10,000 documents"

Expand Down Expand Up @@ -547,7 +550,8 @@ def _execute_pit(self, **kwargs) -> "Dataset":
}
if "query" in query:
_query["query"] = query["query"]
docs.append(_query)

docs.append({"doc": json.dumps(_query)})
logger.debug(f"Added slice {i} to the query {_query}")
except Exception as e:
raise ValueError(f"Error reading from target: {e}")
Expand Down
Loading