Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Bug - Handle unsigned long in sorting order assertion of LongHashSet #17207

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Introduce a setting to disable download of full cluster state from remote on term mismatch([#16798](https://github.com/opensearch-project/OpenSearch/pull/16798/))
- Added ability to retrieve value from DocValues in a flat_object filed([#16802](https://github.com/opensearch-project/OpenSearch/pull/16802))
- Improve performace of NumericTermAggregation by avoiding unnecessary sorting([#17252](https://github.com/opensearch-project/OpenSearch/pull/17252))
- Fix Bug - Handle unsigned long in sorting order assertion of LongHashSet ([#17207](https://github.com/opensearch-project/OpenSearch/pull/17207))
- [Rule Based Auto-tagging] Add in-memory attribute value store ([#17342](https://github.com/opensearch-project/OpenSearch/pull/17342))

### Dependencies
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.opensearch.lucene.util.LongHashSet;
import org.opensearch.lucene.util.UnsignedLongHashSet;

import java.io.IOException;
import java.math.BigInteger;
Expand All @@ -40,12 +40,12 @@
public abstract class SortedUnsignedLongDocValuesSetQuery extends Query {

private final String field;
private final LongHashSet numbers;
private final UnsignedLongHashSet numbers;

SortedUnsignedLongDocValuesSetQuery(String field, BigInteger[] numbers) {
this.field = Objects.requireNonNull(field);
Arrays.sort(numbers);
this.numbers = new LongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
this.numbers = new UnsignedLongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.lucene.util;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
import org.opensearch.common.Numbers;

import java.util.Arrays;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

/** Set of unsigned-longs, optimized for docvalues usage */
public final class UnsignedLongHashSet implements Accountable {
private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(UnsignedLongHashSet.class);

private static final long MISSING = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG;

final long[] table;
final int mask;
final boolean hasMissingValue;
final int size;
/** minimum value in the set, or Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
public final long minValue;
/** maximum value in the set, or Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
public final long maxValue;

/** Construct a set. Values must be in sorted order. */
public UnsignedLongHashSet(long[] values) {
int tableSize = Math.toIntExact(values.length * 3L / 2);
tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
assert tableSize >= values.length * 3L / 2;
table = new long[tableSize];
Arrays.fill(table, MISSING);
mask = tableSize - 1;
boolean hasMissingValue = false;
int size = 0;
long previousValue = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG; // for assert
for (long value : values) {
if (value == MISSING) {
size += hasMissingValue ? 0 : 1;
hasMissingValue = true;
} else if (add(value)) {
++size;
}
assert Long.compareUnsigned(value, previousValue) >= 0 : " values must be provided in sorted order";
previousValue = value;
}
this.hasMissingValue = hasMissingValue;
this.size = size;
this.minValue = values.length == 0 ? Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG : values[0];
this.maxValue = values.length == 0 ? Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG : values[values.length - 1];
}

private boolean add(long l) {
assert l != MISSING;
final int slot = Long.hashCode(l) & mask;
for (int i = slot;; i = (i + 1) & mask) {
if (table[i] == MISSING) {
table[i] = l;
return true;
} else if (table[i] == l) {
// already added
return false;
}
}
}

/**
* check for membership in the set.
*
* <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
* calling this.
*/
public boolean contains(long l) {
if (l == MISSING) {
return hasMissingValue;
}
final int slot = Long.hashCode(l) & mask;
for (int i = slot;; i = (i + 1) & mask) {
if (table[i] == MISSING) {
return false;
} else if (table[i] == l) {
return true;
}
}
}

/** returns a stream of all values contained in this set */
public LongStream stream() {
LongStream stream = Arrays.stream(table).filter(v -> v != MISSING);
if (hasMissingValue) {
stream = LongStream.concat(LongStream.of(MISSING), stream);
}
return stream;
}

@Override
public int hashCode() {
return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));
}

@Override
public boolean equals(Object obj) {
if (obj != null && obj instanceof UnsignedLongHashSet) {
UnsignedLongHashSet that = (UnsignedLongHashSet) obj;
return size == that.size
&& minValue == that.minValue
&& maxValue == that.maxValue
&& mask == that.mask
&& hasMissingValue == that.hasMissingValue
&& Arrays.equals(table, that.table);
}
return false;
}

@Override
public String toString() {
return stream().mapToObj(String::valueOf).collect(Collectors.joining(", ", "[", "]"));
}

/** number of elements in the set */
public int size() {
return size;
}

@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
package org.opensearch.lucene.util;

/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

import org.apache.lucene.tests.util.LuceneTestCase;
import org.opensearch.common.Numbers;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

public class TestDocValuesUnsignedLongHashSet extends LuceneTestCase {

private void assertEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
assertEquals(set1.size(), unsignedLongHashSet.size());

Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());
LuceneTestCase.assertEquals(set1, set2);

if (set1.isEmpty() == false) {
Set<Long> set3 = new HashSet<>(set1);
long removed = set3.iterator().next();
while (true) {
long next = random().nextLong();
if (next != removed && set3.add(next)) {
assertFalse(unsignedLongHashSet.contains(next));
break;
}
}
assertNotEquals(set3, unsignedLongHashSet);
}

assertTrue(set1.stream().allMatch(unsignedLongHashSet::contains));
}

private void assertNotEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());

LuceneTestCase.assertNotEquals(set1, set2);

UnsignedLongHashSet set3 = new UnsignedLongHashSet(
set1.stream().sorted(Long::compareUnsigned).mapToLong(Long::longValue).toArray()
);

LuceneTestCase.assertNotEquals(set2, set3.stream().boxed().collect(Collectors.toSet()));

assertFalse(set1.stream().allMatch(unsignedLongHashSet::contains));
}

public void testEmpty() {
Set<Long> set1 = new HashSet<>();
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] {});
assertEquals(0, set2.size());
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
assertEquals(set1, set2);
}

public void testOneValue() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L));
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L });
assertEquals(1, set2.size());
assertEquals(42L, set2.minValue);
assertEquals(42L, set2.maxValue);
assertEquals(set1, set2);

set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG));
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG });
assertEquals(1, set2.size());
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
assertEquals(set1, set2);
}

public void testTwoValues() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG));
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG });
assertEquals(2, set2.size());
assertEquals(42, set2.minValue);
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
assertEquals(set1, set2);

set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L));
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L });
assertEquals(2, set2.size());
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
assertEquals(42, set2.maxValue);
assertEquals(set1, set2);
}

public void testSameValue() {
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, 42L });
assertEquals(1, set2.size());
assertEquals(42L, set2.minValue);
assertEquals(42L, set2.maxValue);
}

public void testSameMissingPlaceholder() {
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { Long.MIN_VALUE, Long.MIN_VALUE });
assertEquals(1, set2.size());
assertEquals(Long.MIN_VALUE, set2.minValue);
assertEquals(Long.MIN_VALUE, set2.maxValue);
}

public void testRandom() {
final int iters = atLeast(10);
for (int iter = 0; iter < iters; ++iter) {
long[] values = new long[random().nextInt(1 << random().nextInt(16))];
for (int i = 0; i < values.length; ++i) {
if (i == 0 || random().nextInt(10) < 9) {
values[i] = random().nextLong();
} else {
values[i] = values[random().nextInt(i)];
}
}
if (values.length > 0 && random().nextBoolean()) {
values[values.length / 2] = Long.MIN_VALUE;
}
Set<Long> set1 = LongStream.of(values).boxed().collect(Collectors.toSet());
Long[] longObjects = Arrays.stream(values).boxed().toArray(Long[]::new);
// Sort using compareUnsigned
Arrays.sort(longObjects, Long::compareUnsigned);

long[] arr = new long[values.length];
// Convert back to long[]
for (int i = 0; i < arr.length; i++) {
arr[i] = longObjects[i];
}
UnsignedLongHashSet set2 = new UnsignedLongHashSet(arr);
assertEquals(set1, set2);
}
}
}
Loading