Skip to content

Commit c0ab4f7

Browse files
committed
Move df safeguard to affected term(s) only
Signed-off-by: Michael Froh <[email protected]>
1 parent 2353a42 commit c0ab4f7

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,7 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader
120120
}
121121
int max = 0;
122122
long minSumTTF = Long.MAX_VALUE;
123-
int minDocCount = Integer.MAX_VALUE;
123+
int[] docCounts = new int[contexts.length];
124124
for (int i = 0; i < contexts.length; i++) {
125125
TermStates ctx = contexts[i];
126126
int df = ctx.docFreq();
@@ -134,15 +134,12 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader
134134
// we need to find out the minimum sumTTF to adjust the statistics
135135
// otherwise the statistics don't match
136136
minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
137-
minDocCount = Math.min(minDocCount, reader.getDocCount(terms[i].field()));
137+
docCounts[i] = reader.getDocCount(terms[i].field());
138138
}
139139
}
140140
if (maxDoc > minSumTTF) {
141141
maxDoc = (int) minSumTTF;
142142
}
143-
if (maxDoc > minDocCount) {
144-
maxDoc = minDocCount;
145-
}
146143
if (max == 0) {
147144
return; // we are done that term doesn't exist at all
148145
}
@@ -180,7 +177,11 @@ protected int compare(int i, int j) {
180177
if (prev > current) {
181178
actualDf++;
182179
}
183-
contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
180+
// Per field, we want to guarantee that the adjusted df does not exceed the number of docs with the field.
181+
// That is, in the IDF formula (log(1 + (N - n + 0.5) / (n + 0.5))), we need to make sure that n (the
182+
// adjusted df) is never bigger than N (the number of docs with the field).
183+
int fieldMaxDoc = Math.min(maxDoc, docCounts[i]);
184+
contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(fieldMaxDoc, actualDf));
184185
prev = current;
185186
sumTTF += ctx.totalTermFreq();
186187
}

0 commit comments

Comments
 (0)