From 147ae64ba727b359bc66f9cea5c1bd3e8cdd52d8 Mon Sep 17 00:00:00 2001 From: Shailesh Singh Date: Mon, 24 Feb 2025 12:47:49 +0530 Subject: [PATCH] Fix Bug - handle unsigned long in assertion of LongHashSet Signed-off-by: Shailesh Singh --- CHANGELOG.md | 2 + .../SortedUnsignedLongDocValuesSetQuery.java | 6 +- .../lucene/util/UnsignedLongHashSet.java | 139 +++++++++++++++++ .../TestDocValuesUnsignedLongHashSet.java | 145 ++++++++++++++++++ 4 files changed, 289 insertions(+), 3 deletions(-) create mode 100644 server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java create mode 100644 server/src/test/java/org/opensearch/lucene/util/TestDocValuesUnsignedLongHashSet.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 242a10d4ae6da..f23d4c78e918e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Propagate the sourceIncludes and excludes fields from fetchSourceContext to FieldsVisitor. ([#17080](https://github.com/opensearch-project/OpenSearch/pull/17080)) - [Star Tree] [Search] Resolving Date histogram with metric aggregation using star-tree ([#16674](https://github.com/opensearch-project/OpenSearch/pull/16674)) - [Star Tree] [Search] Extensible design to support different query and field types ([#17137](https://github.com/opensearch-project/OpenSearch/pull/17137)) +- Fix Bug - Handle unsigned long in sorting order assertion of LongHashSet ([#17207](https://github.com/opensearch-project/OpenSearch/pull/17207)) + ### Dependencies - Bump `com.google.cloud:google-cloud-core-http` from 2.23.0 to 2.47.0 ([#16504](https://github.com/opensearch-project/OpenSearch/pull/16504)) diff --git a/server/src/main/java/org/opensearch/index/document/SortedUnsignedLongDocValuesSetQuery.java b/server/src/main/java/org/opensearch/index/document/SortedUnsignedLongDocValuesSetQuery.java index 7f4f47054207e..3d677aa6a8dfe 100644 --- a/server/src/main/java/org/opensearch/index/document/SortedUnsignedLongDocValuesSetQuery.java +++ b/server/src/main/java/org/opensearch/index/document/SortedUnsignedLongDocValuesSetQuery.java @@ -25,7 +25,7 @@ import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; -import org.apache.lucene.util.LongHashSet; +import org.opensearch.lucene.util.UnsignedLongHashSet; import java.io.IOException; import java.math.BigInteger; @@ -40,12 +40,12 @@ public abstract class SortedUnsignedLongDocValuesSetQuery extends Query { private final String field; - private final LongHashSet numbers; + private final UnsignedLongHashSet numbers; SortedUnsignedLongDocValuesSetQuery(String field, BigInteger[] numbers) { this.field = Objects.requireNonNull(field); Arrays.sort(numbers); - this.numbers = new LongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray()); + this.numbers = new UnsignedLongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray()); } @Override diff --git a/server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java b/server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java new file mode 100644 index 0000000000000..aa1585d93aa5c --- /dev/null +++ b/server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java @@ -0,0 +1,139 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.lucene.util; + +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.PackedInts; +import org.opensearch.common.Numbers; + +import java.util.Arrays; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.LongStream; + +/** Set of unsigned-longs, optimized for docvalues usage */ +public final class UnsignedLongHashSet implements Accountable { + private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(UnsignedLongHashSet.class); + + private static final long MISSING = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG; + + final long[] table; + final int mask; + final boolean hasMissingValue; + final int size; + /** minimum value in the set, or Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */ + public final long minValue; + /** maximum value in the set, or Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */ + public final long maxValue; + + /** Construct a set. Values must be in sorted order. */ + public UnsignedLongHashSet(long[] values) { + int tableSize = Math.toIntExact(values.length * 3L / 2); + tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2 + assert tableSize >= values.length * 3L / 2; + table = new long[tableSize]; + Arrays.fill(table, MISSING); + mask = tableSize - 1; + boolean hasMissingValue = false; + int size = 0; + long previousValue = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG; // for assert + for (long value : values) { + if (value == MISSING) { + size += hasMissingValue ? 0 : 1; + hasMissingValue = true; + } else if (add(value)) { + ++size; + } + assert Long.compareUnsigned(value, previousValue) >= 0 : " values must be provided in sorted order"; + previousValue = value; + } + this.hasMissingValue = hasMissingValue; + this.size = size; + this.minValue = values.length == 0 ? Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG : values[0]; + this.maxValue = values.length == 0 ? Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG : values[values.length - 1]; + } + + private boolean add(long l) { + assert l != MISSING; + final int slot = Long.hashCode(l) & mask; + for (int i = slot;; i = (i + 1) & mask) { + if (table[i] == MISSING) { + table[i] = l; + return true; + } else if (table[i] == l) { + // already added + return false; + } + } + } + + /** + * check for membership in the set. + * + *

You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before + * calling this. + */ + public boolean contains(long l) { + if (l == MISSING) { + return hasMissingValue; + } + final int slot = Long.hashCode(l) & mask; + for (int i = slot;; i = (i + 1) & mask) { + if (table[i] == MISSING) { + return false; + } else if (table[i] == l) { + return true; + } + } + } + + /** returns a stream of all values contained in this set */ + public LongStream stream() { + LongStream stream = Arrays.stream(table).filter(v -> v != MISSING); + if (hasMissingValue) { + stream = LongStream.concat(LongStream.of(MISSING), stream); + } + return stream; + } + + @Override + public int hashCode() { + return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table)); + } + + @Override + public boolean equals(Object obj) { + if (obj != null && obj instanceof UnsignedLongHashSet) { + UnsignedLongHashSet that = (UnsignedLongHashSet) obj; + return size == that.size + && minValue == that.minValue + && maxValue == that.maxValue + && mask == that.mask + && hasMissingValue == that.hasMissingValue + && Arrays.equals(table, that.table); + } + return false; + } + + @Override + public String toString() { + return stream().mapToObj(String::valueOf).collect(Collectors.joining(", ", "[", "]")); + } + + /** number of elements in the set */ + public int size() { + return size; + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table); + } +} diff --git a/server/src/test/java/org/opensearch/lucene/util/TestDocValuesUnsignedLongHashSet.java b/server/src/test/java/org/opensearch/lucene/util/TestDocValuesUnsignedLongHashSet.java new file mode 100644 index 0000000000000..ecee4914f18c1 --- /dev/null +++ b/server/src/test/java/org/opensearch/lucene/util/TestDocValuesUnsignedLongHashSet.java @@ -0,0 +1,145 @@ +package org.opensearch.lucene.util; + +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.opensearch.common.Numbers; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.LongStream; + +public class TestDocValuesUnsignedLongHashSet extends LuceneTestCase { + + private void assertEquals(Set set1, UnsignedLongHashSet unsignedLongHashSet) { + assertEquals(set1.size(), unsignedLongHashSet.size()); + + Set set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet()); + LuceneTestCase.assertEquals(set1, set2); + + if (set1.isEmpty() == false) { + Set set3 = new HashSet<>(set1); + long removed = set3.iterator().next(); + while (true) { + long next = random().nextLong(); + if (next != removed && set3.add(next)) { + assertFalse(unsignedLongHashSet.contains(next)); + break; + } + } + assertNotEquals(set3, unsignedLongHashSet); + } + + assertTrue(set1.stream().allMatch(unsignedLongHashSet::contains)); + } + + private void assertNotEquals(Set set1, UnsignedLongHashSet unsignedLongHashSet) { + Set set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet()); + + LuceneTestCase.assertNotEquals(set1, set2); + + UnsignedLongHashSet set3 = new UnsignedLongHashSet( + set1.stream() + .sorted(Long::compareUnsigned) + .mapToLong(Long::longValue) + .toArray() + ); + + LuceneTestCase.assertNotEquals(set2, set3.stream().boxed().collect(Collectors.toSet())); + + assertFalse(set1.stream().allMatch(unsignedLongHashSet::contains)); + } + + public void testEmpty() { + Set set1 = new HashSet<>(); + UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] {}); + assertEquals(0, set2.size()); + assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue); + assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue); + assertEquals(set1, set2); + } + + public void testOneValue() { + Set set1 = new HashSet<>(Arrays.asList(42L)); + UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L }); + assertEquals(1, set2.size()); + assertEquals(42L, set2.minValue); + assertEquals(42L, set2.maxValue); + assertEquals(set1, set2); + + set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG)); + set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG }); + assertEquals(1, set2.size()); + assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue); + assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue); + assertEquals(set1, set2); + } + + public void testTwoValues() { + Set set1 = new HashSet<>(Arrays.asList(42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG)); + UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG }); + assertEquals(2, set2.size()); + assertEquals(42, set2.minValue); + assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue); + assertEquals(set1, set2); + + set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L)); + set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L }); + assertEquals(2, set2.size()); + assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue); + assertEquals(42, set2.maxValue); + assertEquals(set1, set2); + } + + public void testSameValue() { + UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, 42L }); + assertEquals(1, set2.size()); + assertEquals(42L, set2.minValue); + assertEquals(42L, set2.maxValue); + } + + public void testSameMissingPlaceholder() { + UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { Long.MIN_VALUE, Long.MIN_VALUE }); + assertEquals(1, set2.size()); + assertEquals(Long.MIN_VALUE, set2.minValue); + assertEquals(Long.MIN_VALUE, set2.maxValue); + } + + public void testRandom() { + final int iters = atLeast(10); + for (int iter = 0; iter < iters; ++iter) { + long[] values = new long[random().nextInt(1 << random().nextInt(16))]; + for (int i = 0; i < values.length; ++i) { + if (i == 0 || random().nextInt(10) < 9) { + values[i] = random().nextLong(); + } else { + values[i] = values[random().nextInt(i)]; + } + } + if (values.length > 0 && random().nextBoolean()) { + values[values.length / 2] = Long.MIN_VALUE; + } + Set set1 = LongStream.of(values).boxed().collect(Collectors.toSet()); + Long[] longObjects = Arrays.stream(values).boxed().toArray(Long[]::new); + // Sort using compareUnsigned + Arrays.sort(longObjects, Long::compareUnsigned); + + long[] arr = new long[values.length]; + // Convert back to long[] + for (int i = 0; i < arr.length; i++) { + arr[i] = longObjects[i]; + } + UnsignedLongHashSet set2 = new UnsignedLongHashSet(arr); + assertEquals(set1, set2); + } + } +}