Avoid negative scores returned from multi_match query with cross_fields

msfroh · msfroh · commit 2353a421d7ef · 2024-05-25T02:42:17.000Z
Under specific circumstances, when using `cross_fields` scoring on a
`multi_match` query, we can end up with negative scores from the inverse
document frequency calculation in the BM25 formula.

Specifically, the IDF is calculated as:

```
log(1 + (N - n + 0.5) / (n + 0.5))
```

where `N` is the number of documents containing the field and `n` is the
number of documents containing the given term in the field. Obviously,
`n` should always be less than or equal to `N`.

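Unfortunately, `cross_fields` makes up a new value for `n` and tries to
use it across all fields. If that value exceeds `N` for some field, the
argument of the logarithm drops below 1 and the IDF becomes negative.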

This change finds the minimum (nonzero) value of `N` and uses that as an
upper bound for the new value of `n`.

Signed-off-by: Michael Froh <froh@amazon.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -41,6 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Fix get field mapping API returns 404 error in mixed cluster with multiple versions ([#13624](https://github.com/opensearch-project/OpenSearch/pull/13624))
 - Allow clearing `remote_store.compatibility_mode` setting ([#13646](https://github.com/opensearch-project/OpenSearch/pull/13646))
 - Fix ReplicaShardBatchAllocator to batch shards without duplicates ([#13710](https://github.com/opensearch-project/OpenSearch/pull/13710))
+- Don't return negative scores from `multi_match` query with `cross_fields` type  ([#13829](https://github.com/opensearch-project/OpenSearch/pull/13829))
 
 ### Security
 
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search/50_multi_match.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search/50_multi_match.yml
@@ -0,0 +1,32 @@
+"Cross fields do not return negative scores":
+  - do:
+      index:
+        index: test
+        id: 1
+        body: { "color" : "orange red yellow" }
+  - do:
+      index:
+        index: test
+        id: 2
+        body: { "color": "orange red purple", "shape": "red square" }
+  - do:
+      index:
+        index: test
+        id: 3
+        body: { "color" : "orange red yellow purple" }
+  - do:
+      indices.refresh: { }
+  - do:
+      search:
+        index: test
+        body:
+          query:
+            multi_match:
+              query: "red"
+              type: "cross_fields"
+              fields: [ "color", "shape^100"]
+              tie_breaker: 0.1
+          explain: true
+  - match: { hits.total.value: 3 }
+  - match: { hits.hits.0._id: "2" }
+  - gt: { hits.hits.2._score: 0.0 }
diff --git a/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java b/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java
@@ -120,6 +120,7 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader
         }
         int max = 0;
         long minSumTTF = Long.MAX_VALUE;
+        int minDocCount = Integer.MAX_VALUE;
         for (int i = 0; i < contexts.length; i++) {
             TermStates ctx = contexts[i];
             int df = ctx.docFreq();
@@ -133,11 +134,15 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader
                 // we need to find out the minimum sumTTF to adjust the statistics
                 // otherwise the statistics don't match
                 minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
+                minDocCount = Math.min(minDocCount, reader.getDocCount(terms[i].field()));
             }
         }
         if (maxDoc > minSumTTF) {
             maxDoc = (int) minSumTTF;
         }
+        if (maxDoc > minDocCount) {
+            maxDoc = minDocCount;
+        }
         if (max == 0) {
             return; // we are done that term doesn't exist at all
         }

Original file line number	Diff line number	Diff line change
`@@ -120,6 +120,7 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader`
`120`	`120`	`}`
`121`	`121`	`int max = 0;`
`122`	`122`	`long minSumTTF = Long.MAX_VALUE;`
	`123`	`+ int minDocCount = Integer.MAX_VALUE;`
`123`	`124`	`for (int i = 0; i < contexts.length; i++) {`
`124`	`125`	`TermStates ctx = contexts[i];`
`125`	`126`	`int df = ctx.docFreq();`
`@@ -133,11 +134,15 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader`
`133`	`134`	`// we need to find out the minimum sumTTF to adjust the statistics`
`134`	`135`	`// otherwise the statistics don't match`
`135`	`136`	`minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));`
	`137`	`+ minDocCount = Math.min(minDocCount, reader.getDocCount(terms[i].field()));`
`136`	`138`	`}`
`137`	`139`	`}`
`138`	`140`	`if (maxDoc > minSumTTF) {`
`139`	`141`	`maxDoc = (int) minSumTTF;`
`140`	`142`	`}`
	`143`	`+ if (maxDoc > minDocCount) {`
	`144`	`+ maxDoc = minDocCount;`
	`145`	`+ }`
`141`	`146`	`if (max == 0) {`
`142`	`147`	`return; // we are done that term doesn't exist at all`
`143`	`148`	`}`