Move df safeguard to affected term(s) only

msfroh · msfroh · commit c0ab4f747a34 · 2024-05-28T22:00:06.000Z
Signed-off-by: Michael Froh <froh@amazon.com>
diff --git a/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java b/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java
@@ -120,7 +120,7 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader
         }
         int max = 0;
         long minSumTTF = Long.MAX_VALUE;
-        int minDocCount = Integer.MAX_VALUE;
+        int[] docCounts = new int[contexts.length];
         for (int i = 0; i < contexts.length; i++) {
             TermStates ctx = contexts[i];
             int df = ctx.docFreq();
@@ -134,15 +134,12 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader
                 // we need to find out the minimum sumTTF to adjust the statistics
                 // otherwise the statistics don't match
                 minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
-                minDocCount = Math.min(minDocCount, reader.getDocCount(terms[i].field()));
+                docCounts[i] = reader.getDocCount(terms[i].field());
             }
         }
         if (maxDoc > minSumTTF) {
             maxDoc = (int) minSumTTF;
         }
-        if (maxDoc > minDocCount) {
-            maxDoc = minDocCount;
-        }
         if (max == 0) {
             return; // we are done that term doesn't exist at all
         }
@@ -180,7 +177,11 @@ protected int compare(int i, int j) {
             if (prev > current) {
                 actualDf++;
             }
-            contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
+            // Per field, we want to guarantee that the adjusted df does not exceed the number of docs with the field.
+            // That is, in the IDF formula (log(1 + (N - n + 0.5) / (n + 0.5))), we need to make sure that n (the
+            // adjusted df) is never bigger than N (the number of docs with the field).
+            int fieldMaxDoc = Math.min(maxDoc, docCounts[i]);
+            contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(fieldMaxDoc, actualDf));
             prev = current;
             sumTTF += ctx.totalTermFreq();
         }

Original file line number	Diff line number	Diff line change
`@@ -120,7 +120,7 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader`
`120`	`120`	`}`
`121`	`121`	`int max = 0;`
`122`	`122`	`long minSumTTF = Long.MAX_VALUE;`
`123`		`- int minDocCount = Integer.MAX_VALUE;`
	`123`	`+ int[] docCounts = new int[contexts.length];`
`124`	`124`	`for (int i = 0; i < contexts.length; i++) {`
`125`	`125`	`TermStates ctx = contexts[i];`
`126`	`126`	`int df = ctx.docFreq();`
`@@ -134,15 +134,12 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader`
`134`	`134`	`// we need to find out the minimum sumTTF to adjust the statistics`
`135`	`135`	`// otherwise the statistics don't match`
`136`	`136`	`minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));`
`137`		`- minDocCount = Math.min(minDocCount, reader.getDocCount(terms[i].field()));`
	`137`	`+ docCounts[i] = reader.getDocCount(terms[i].field());`
`138`	`138`	`}`
`139`	`139`	`}`
`140`	`140`	`if (maxDoc > minSumTTF) {`
`141`	`141`	`maxDoc = (int) minSumTTF;`
`142`	`142`	`}`
`143`		`- if (maxDoc > minDocCount) {`
`144`		`- maxDoc = minDocCount;`
`145`		`- }`
`146`	`143`	`if (max == 0) {`
`147`	`144`	`return; // we are done that term doesn't exist at all`
`148`	`145`	`}`
`@@ -180,7 +177,11 @@ protected int compare(int i, int j) {`
`180`	`177`	`if (prev > current) {`
`181`	`178`	`actualDf++;`
`182`	`179`	`}`
`183`		`- contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));`
	`180`	`+ // Per field, we want to guarantee that the adjusted df does not exceed the number of docs with the field.`
	`181`	`+ // That is, in the IDF formula (log(1 + (N - n + 0.5) / (n + 0.5))), we need to make sure that n (the`
	`182`	`+ // adjusted df) is never bigger than N (the number of docs with the field).`
	`183`	`+ int fieldMaxDoc = Math.min(maxDoc, docCounts[i]);`
	`184`	`+ contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(fieldMaxDoc, actualDf));`
`184`	`185`	`prev = current;`
`185`	`186`	`sumTTF += ctx.totalTermFreq();`
`186`	`187`	`}`