Skip to content

Commit

Permalink
Move df safeguard to affected term(s) only
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Froh <[email protected]>
  • Loading branch information
msfroh committed May 28, 2024
1 parent 2353a42 commit c0ab4f7
Showing 1 changed file with 7 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader
}
int max = 0;
long minSumTTF = Long.MAX_VALUE;
int minDocCount = Integer.MAX_VALUE;
int[] docCounts = new int[contexts.length];
for (int i = 0; i < contexts.length; i++) {
TermStates ctx = contexts[i];
int df = ctx.docFreq();
Expand All @@ -134,15 +134,12 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader
// We need to find the minimum sumTTF across the terms' fields and use it to adjust the statistics;
// otherwise the blended statistics would not be consistent with each other.
minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
minDocCount = Math.min(minDocCount, reader.getDocCount(terms[i].field()));
docCounts[i] = reader.getDocCount(terms[i].field());
}
}
if (maxDoc > minSumTTF) {
maxDoc = (int) minSumTTF;
}
if (maxDoc > minDocCount) {
maxDoc = minDocCount;
}
if (max == 0) {
return; // we are done: the term doesn't exist at all
}
Expand Down Expand Up @@ -180,7 +177,11 @@ protected int compare(int i, int j) {
if (prev > current) {
actualDf++;
}
contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
// Per field, we want to guarantee that the adjusted df does not exceed the number of docs with the field.
// That is, in the IDF formula (log(1 + (N - n + 0.5) / (n + 0.5))), we need to make sure that n (the
// adjusted df) is never bigger than N (the number of docs with the field).
int fieldMaxDoc = Math.min(maxDoc, docCounts[i]);
contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(fieldMaxDoc, actualDf));
prev = current;
sumTTF += ctx.totalTermFreq();
}
Expand Down

0 comments on commit c0ab4f7

Please sign in to comment.