Skip to content

Commit 28dd0f5

Browse files
author
Rob Pasternak
authored
feat: Add options for what to do with missing metadata fields in MetaFieldRanker (#7700)
* Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation.
* Implement `missing_meta` functionality in `run()`.
* Finish first draft of revised `MetaFieldRanker` functionality.
* Add tests for `MetaFieldRanker` `missing_meta` functionality.
* Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation.
* Implement `missing_meta` functionality in `run()`.
* Finish first draft of revised `MetaFieldRanker` functionality.
* Add tests for `MetaFieldRanker` `missing_meta` functionality.
* Add release notes for new `missing_meta` param of `MetaFieldRanker`.
* Move part of `docs_missing_meta_field` warning string outside of `if...elif...else`.
1 parent 14c7b02 commit 28dd0f5

File tree

3 files changed

+109
-8
lines changed

3 files changed

+109
-8
lines changed

Diff for: haystack/components/rankers/meta_field.py

+62-8
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def __init__(
4343
top_k: Optional[int] = None,
4444
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion",
4545
sort_order: Literal["ascending", "descending"] = "descending",
46+
missing_meta: Literal["drop", "top", "bottom"] = "bottom",
4647
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
4748
):
4849
"""
@@ -65,6 +66,14 @@ def __init__(
6566
:param sort_order:
6667
Whether to sort the meta field by ascending or descending order.
6768
Possible values are `descending` (default) and `ascending`.
69+
:param missing_meta:
70+
What to do with documents that are missing the sorting metadata field.
71+
Possible values are:
72+
- 'drop' will drop the documents entirely.
73+
- 'top' will place the documents at the top of the metadata-sorted list
74+
(regardless of 'ascending' or 'descending').
75+
- 'bottom' will place the documents at the bottom of the metadata-sorted list
76+
(regardless of 'ascending' or 'descending').
6877
:param meta_value_type:
6978
Parse the meta value into the data type specified before sorting.
7079
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
@@ -82,11 +91,13 @@ def __init__(
8291
self.top_k = top_k
8392
self.ranking_mode = ranking_mode
8493
self.sort_order = sort_order
94+
self.missing_meta = missing_meta
8595
self._validate_params(
8696
weight=self.weight,
8797
top_k=self.top_k,
8898
ranking_mode=self.ranking_mode,
8999
sort_order=self.sort_order,
100+
missing_meta=self.missing_meta,
90101
meta_value_type=meta_value_type,
91102
)
92103
self.meta_value_type = meta_value_type
@@ -97,6 +108,7 @@ def _validate_params(
97108
top_k: Optional[int],
98109
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"],
99110
sort_order: Literal["ascending", "descending"],
111+
missing_meta: Literal["drop", "top", "bottom"],
100112
meta_value_type: Optional[Literal["float", "int", "date"]],
101113
):
102114
if top_k is not None and top_k <= 0:
@@ -125,6 +137,14 @@ def _validate_params(
125137
"MetaFieldRanker." % sort_order
126138
)
127139

140+
if missing_meta not in ["drop", "top", "bottom"]:
141+
raise ValueError(
142+
"The value of parameter <missing_meta> must be 'drop', 'top', or 'bottom', "
143+
"but is currently set to '%s'.\n"
144+
"Change the <missing_meta> value to 'drop', 'top', or 'bottom' when initializing the "
145+
"MetaFieldRanker." % missing_meta
146+
)
147+
128148
if meta_value_type not in ["float", "int", "date", None]:
129149
raise ValueError(
130150
"The value of parameter <meta_value_type> must be 'float', 'int', 'date' or None but is "
@@ -141,6 +161,7 @@ def run(
141161
weight: Optional[float] = None,
142162
ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None,
143163
sort_order: Optional[Literal["ascending", "descending"]] = None,
164+
missing_meta: Optional[Literal["drop", "top", "bottom"]] = None,
144165
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
145166
):
146167
"""
@@ -171,6 +192,15 @@ def run(
171192
Whether to sort the meta field by ascending or descending order.
172193
Possible values are `descending` (default) and `ascending`.
173194
If not provided, the sort_order provided at initialization time is used.
195+
:param missing_meta:
196+
What to do with documents that are missing the sorting metadata field.
197+
Possible values are:
198+
- 'drop' will drop the documents entirely.
199+
- 'top' will place the documents at the top of the metadata-sorted list
200+
(regardless of 'ascending' or 'descending').
201+
- 'bottom' will place the documents at the bottom of the metadata-sorted list
202+
(regardless of 'ascending' or 'descending').
203+
If not provided, the missing_meta provided at initialization time is used.
174204
:param meta_value_type:
175205
Parse the meta value into the data type specified before sorting.
176206
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
@@ -199,12 +229,14 @@ def run(
199229
weight = weight if weight is not None else self.weight
200230
ranking_mode = ranking_mode or self.ranking_mode
201231
sort_order = sort_order or self.sort_order
232+
missing_meta = missing_meta or self.missing_meta
202233
meta_value_type = meta_value_type or self.meta_value_type
203234
self._validate_params(
204235
weight=weight,
205236
top_k=top_k,
206237
ranking_mode=ranking_mode,
207238
sort_order=sort_order,
239+
missing_meta=missing_meta,
208240
meta_value_type=meta_value_type,
209241
)
210242

@@ -227,13 +259,27 @@ def run(
227259
return {"documents": documents[:top_k]}
228260

229261
if len(docs_missing_meta_field) > 0:
230-
logger.warning(
231-
"The parameter <meta_field> is currently set to '{meta_field}' but the Documents with IDs {document_ids} don't have this meta key.\n"
232-
"These Documents will be placed at the end of the sorting order.",
233-
meta_field=self.meta_field,
234-
document_ids=",".join([doc.id for doc in docs_missing_meta_field]),
262+
warning_start = (
263+
f"The parameter <meta_field> is currently set to '{self.meta_field}' but the Documents "
264+
f"with IDs {','.join([doc.id for doc in docs_missing_meta_field])} don't have this meta key.\n"
235265
)
236266

267+
if missing_meta == "bottom":
268+
logger.warning(
269+
"{warning_start}Because the parameter <missing_meta> is set to 'bottom', these Documents will be placed at the end of the sorting order.",
270+
warning_start=warning_start,
271+
)
272+
elif missing_meta == "top":
273+
logger.warning(
274+
"{warning_start}Because the parameter <missing_meta> is set to 'top', these Documents will be placed at the top of the sorting order.",
275+
warning_start=warning_start,
276+
)
277+
else:
278+
logger.warning(
279+
"{warning_start}Because the parameter <missing_meta> is set to 'drop', these Documents will be removed from the list of retrieved Documents.",
280+
warning_start=warning_start,
281+
)
282+
237283
# If meta_value_type is provided try to parse the meta values
238284
parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, meta_value_type=meta_value_type)
239285
tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field))
@@ -252,10 +298,18 @@ def run(
252298
)
253299
return {"documents": documents[:top_k]}
254300

255-
# Add the docs missing the meta_field back on the end
301+
# Merge rankings and handle missing meta fields as specified in the missing_meta parameter
256302
sorted_by_meta = [doc for meta, doc in tuple_sorted_by_meta]
257-
sorted_documents = sorted_by_meta + docs_missing_meta_field
258-
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
303+
if missing_meta == "bottom":
304+
sorted_documents = sorted_by_meta + docs_missing_meta_field
305+
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
306+
elif missing_meta == "top":
307+
sorted_documents = docs_missing_meta_field + sorted_by_meta
308+
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
309+
else:
310+
sorted_documents = sorted_by_meta
311+
sorted_documents = self._merge_rankings(docs_with_meta_field, sorted_documents, weight, ranking_mode)
312+
259313
return {"documents": sorted_documents[:top_k]}
260314

261315
def _parse_meta(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
features:
3+
- |
4+
Add a new `missing_meta` param to `MetaFieldRanker`, which determines what to do with
5+
documents that lack the ranked meta field. Supported values are `"bottom"` (which
6+
puts documents with missing meta at the bottom of the sorted list), `"top"` (which puts them
7+
at the top), and `"drop"` (which removes them from the results entirely).

Diff for: test/components/rankers/test_metafield.py

+40
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,10 @@ def test_raises_value_error_if_wrong_sort_order(self):
175175
with pytest.raises(ValueError):
176176
MetaFieldRanker(meta_field="rating", sort_order="wrong_order")
177177

178+
def test_raises_value_error_if_wrong_missing_meta(self):
179+
with pytest.raises(ValueError):
180+
MetaFieldRanker(meta_field="rating", missing_meta="wrong_missing_meta")
181+
178182
def test_raises_value_error_if_wrong_meta_value_type(self):
179183
with pytest.raises(ValueError):
180184
MetaFieldRanker(meta_field="rating", meta_value_type="wrong_type")
@@ -239,3 +243,39 @@ def test_different_ranking_mode_for_init_vs_run(self):
239243
output = ranker.run(documents=docs_before, ranking_mode="reciprocal_rank_fusion")
240244
docs_after = output["documents"]
241245
assert docs_after[0].score == pytest.approx(0.016261, abs=1e-5)
246+
247+
def test_missing_meta_bottom(self):
248+
ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="bottom")
249+
docs_before = [
250+
Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
251+
Document(id="2", content="abc", meta={}, score=0.4),
252+
Document(id="3", content="abc", meta={"rating": 2.1}, score=0.39),
253+
]
254+
output = ranker.run(documents=docs_before)
255+
docs_after = output["documents"]
256+
assert len(docs_after) == 3
257+
assert docs_after[2].id == "2"
258+
259+
def test_missing_meta_top(self):
260+
ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="top")
261+
docs_before = [
262+
Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
263+
Document(id="2", content="abc", meta={}, score=0.59),
264+
Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
265+
]
266+
output = ranker.run(documents=docs_before)
267+
docs_after = output["documents"]
268+
assert len(docs_after) == 3
269+
assert docs_after[0].id == "2"
270+
271+
def test_missing_meta_drop(self):
272+
ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="drop")
273+
docs_before = [
274+
Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
275+
Document(id="2", content="abc", meta={}, score=0.59),
276+
Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
277+
]
278+
output = ranker.run(documents=docs_before)
279+
docs_after = output["documents"]
280+
assert len(docs_after) == 2
281+
assert "2" not in [doc.id for doc in docs_after]

0 commit comments

Comments
 (0)