From ee74c5916b06a1277918d96e89f9573b49f11859 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Sat, 25 May 2024 02:25:33 +0000 Subject: [PATCH] Avoid negative scores returned from multi_match query with cross_fields Under specific circumstances, when using `cross_fields` scoring on a `multi_match` query, we can end up with negative scores from the inverse document frequency calculation in the BM25 formula. Specifically, the IDF is calculated as: ``` log(1 + (N - n + 0.5) / (n + 0.5)) ``` where `N` is the number of documents containing the field and `n` is the number of documents containing the given term in the field. Obviously, `n` should always be less than or equal to `N`. Unfortunately, `cross_fields` makes up a new value for `n` and tries to use it across all fields. This change finds the minimum (nonzero) value of `N` and uses that as an upper bound for the new value of `n`. Signed-off-by: Michael Froh --- CHANGELOG.md | 1 + .../test/search/50_multi_match.yml | 32 +++++++++++++++++++ .../lucene/queries/BlendedTermQuery.java | 5 +++ 3 files changed, 38 insertions(+) create mode 100644 rest-api-spec/src/main/resources/rest-api-spec/test/search/50_multi_match.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 5416005e4706c..7be2080d12227 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fix get field mapping API returns 404 error in mixed cluster with multiple versions ([#13624](https://github.com/opensearch-project/OpenSearch/pull/13624)) - Allow clearing `remote_store.compatibility_mode` setting ([#13646](https://github.com/opensearch-project/OpenSearch/pull/13646)) - Fix ReplicaShardBatchAllocator to batch shards without duplicates ([#13710](https://github.com/opensearch-project/OpenSearch/pull/13710)) +- Don't return negative scores from `multi_match` query with `cross_fields` type ([#13829](https://github.com/opensearch-project/OpenSearch/pull/13829)) - Pass parent filter to inner hit query ([#13770](https://github.com/opensearch-project/OpenSearch/pull/13770)) ### Security diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search/50_multi_match.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search/50_multi_match.yml new file mode 100644 index 0000000000000..b20438c45b33d --- /dev/null +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search/50_multi_match.yml @@ -0,0 +1,32 @@ +"Cross fields do not return negative scores": + - do: + index: + index: test + id: 1 + body: { "color" : "orange red yellow" } + - do: + index: + index: test + id: 2 + body: { "color": "orange red purple", "shape": "red square" } + - do: + index: + index: test + id: 3 + body: { "color" : "orange red yellow purple" } + - do: + indices.refresh: { } + - do: + search: + index: test + body: + query: + multi_match: + query: "red" + type: "cross_fields" + fields: [ "color", "shape^100"] + tie_breaker: 0.1 + explain: true + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "2" } + - gt: { hits.hits.2._score: 0.0 } diff --git a/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java b/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java index b47b974b96fed..17419804b1ffa 100644 --- a/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java +++ b/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java @@ -120,6 +120,7 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader } int max = 0; long minSumTTF = Long.MAX_VALUE; + int minDocCount = Integer.MAX_VALUE; for (int i = 0; i < contexts.length; i++) { TermStates ctx = contexts[i]; int df = ctx.docFreq(); @@ -133,11 +134,15 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader // we need to find out the minimum sumTTF to adjust the statistics // otherwise the statistics don't match minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field())); + minDocCount = Math.min(minDocCount, reader.getDocCount(terms[i].field())); } } if (maxDoc > minSumTTF) { maxDoc = (int) minSumTTF; } + if (maxDoc > minDocCount) { + maxDoc = minDocCount; + } if (max == 0) { return; // we are done that term doesn't exist at all }