Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Serialize query strings to avoid Ray Dataset column imputation #1171

Merged
merged 2 commits into from
Feb 11, 2025
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
from copy import deepcopy

Expand Down Expand Up @@ -288,7 +289,7 @@ def __init__(
logger.info(f"OpenSearchReader using PIT: {self.use_pit}")

@timetrace("OpenSearchReader")
def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
def _to_parent_doc(self, doc: dict[str, Any]) -> List[dict[str, Any]]:
"""
Get all parent documents from a given slice.
"""
Expand All @@ -304,6 +305,7 @@ def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
raise ValueError("Target is not present\n" f"Parameters: {self._query_params}\n")

os_client = client._client
slice_query = json.loads(doc["doc"])

assert (
get_doc_count_for_slice(os_client, slice_query) < 10000
Expand Down Expand Up @@ -341,7 +343,7 @@ def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
logger.info(f"Read {len(results)} documents from {self._query_params.index_name}")

except Exception as e:
raise ValueError(f"Error reading from target: {e}")
raise ValueError(f"Error reading from target: {e}, query: {slice_query}")
finally:
if client is not None:
client.close()
Expand Down Expand Up @@ -547,7 +549,8 @@ def _execute_pit(self, **kwargs) -> "Dataset":
}
if "query" in query:
_query["query"] = query["query"]
docs.append(_query)

docs.append({"doc": json.dumps(_query)})
logger.debug(f"Added slice {i} to the query {_query}")
except Exception as e:
raise ValueError(f"Error reading from target: {e}")
Expand Down
Loading