Jelajahi Sumber

GlobalOrdCardinalityAggregator should use HyperLogLogPlusPlus instead of HyperLogLogPlusPlusSparse (#105546) (#105602)

Use the generic HyperLogLogPlusPlus on GlobalOrdCardinalityAggregator so we promote the algorihtm to HLL when 
we reach the linear counting threshold.
Ignacio Vera 1 tahun lalu
induk
melakukan
e1a68fa7fd

+ 6 - 0
docs/changelog/105546.yaml

@@ -0,0 +1,6 @@
+pr: 105546
+summary: '`GlobalOrdCardinalityAggregator` should use `HyperLogLogPlusPlus` instead
+  of `HyperLogLogPlusPlusSparse`'
+area: Aggregations
+type: bug
+issues: []

+ 2 - 3
server/src/main/java/org/elasticsearch/search/aggregations/metrics/GlobalOrdCardinalityAggregator.java

@@ -65,7 +65,7 @@ public class GlobalOrdCardinalityAggregator extends NumericMetricsAggregator.Sin
 
     // Build at post-collection phase
     @Nullable
-    private HyperLogLogPlusPlusSparse counts;
+    private HyperLogLogPlusPlus counts;
     private ObjectArray<BitArray> visitedOrds;
     private SortedSetDocValues values;
 
@@ -285,7 +285,7 @@ public class GlobalOrdCardinalityAggregator extends NumericMetricsAggregator.Sin
     }
 
     protected void doPostCollection() throws IOException {
-        counts = new HyperLogLogPlusPlusSparse(precision, bigArrays, visitedOrds.size());
+        counts = new HyperLogLogPlusPlus(precision, bigArrays, visitedOrds.size());
         try (LongArray hashes = bigArrays.newLongArray(maxOrd, false)) {
             try (BitArray allVisitedOrds = new BitArray(maxOrd, bigArrays)) {
                 for (long bucket = visitedOrds.size() - 1; bucket >= 0; --bucket) {
@@ -308,7 +308,6 @@ public class GlobalOrdCardinalityAggregator extends NumericMetricsAggregator.Sin
                 try (BitArray bits = visitedOrds.get(bucket)) {
                     if (bits != null) {
                         visitedOrds.set(bucket, null); // remove bitset from array
-                        counts.ensureCapacity(bucket, bits.cardinality());
                         for (long ord = bits.nextSetBit(0); ord < Long.MAX_VALUE; ord = ord + 1 < maxOrd
                             ? bits.nextSetBit(ord + 1)
                             : Long.MAX_VALUE) {

+ 32 - 0
server/src/test/java/org/elasticsearch/search/aggregations/metrics/CardinalityAggregatorTests.java

@@ -25,8 +25,10 @@ import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.geo.GeoPoint;
+import org.elasticsearch.common.hash.MurmurHash3;
 import org.elasticsearch.common.network.InetAddresses;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.core.CheckedConsumer;
 import org.elasticsearch.index.fielddata.ScriptDocValues;
 import org.elasticsearch.index.mapper.IpFieldMapper;
@@ -290,6 +292,36 @@ public class CardinalityAggregatorTests extends AggregatorTestCase {
         );
     }
 
+    public void testGlobalOrdinals() throws IOException {
+        // aggregation with minimum precision so we don't need to generate too many distinct values
+        final CardinalityAggregationBuilder aggregationBuilder = new CardinalityAggregationBuilder("name").field("str_value")
+            .precisionThreshold(1);
+        final MappedFieldType mappedFieldTypes = new KeywordFieldMapper.KeywordFieldType("str_value");
+
+        // range big enough that will force force promotion to HHL the time to time
+        final BytesRef[] differentValues = new BytesRef[randomIntBetween(10, 30)];
+        for (int i = 0; i < differentValues.length; i++) {
+            differentValues[i] = new BytesRef(randomAlphaOfLength(8));
+        }
+        // large number of documents to force global ordinals
+        final int numDocs = randomIntBetween(1000, 10000);
+        final HyperLogLogPlusPlus hll = new HyperLogLogPlusPlus(HyperLogLogPlusPlus.MIN_PRECISION, BigArrays.NON_RECYCLING_INSTANCE, 1);
+        final MurmurHash3.Hash128 hash = new MurmurHash3.Hash128();
+        final CheckedConsumer<RandomIndexWriter, IOException> buildIndex = iw -> {
+            for (int i = 0; i < numDocs; i++) {
+                final BytesRef value = differentValues[i % differentValues.length];
+                MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, hash);
+                hll.collect(0, hash.h1);
+                iw.addDocument(List.of(new SortedDocValuesField("str_value", value)));
+            }
+        };
+
+        testAggregation(aggregationBuilder, new MatchAllDocsQuery(), buildIndex, card -> {
+            assertEquals(hll.cardinality(0), card.getValue(), 0);
+            assertTrue(AggregationInspectionHelper.hasValue(card));
+        }, mappedFieldTypes);
+    }
+
     public void testIndexedSingleValuedIP() throws IOException {
         // IP addresses are interesting to test because they use sorted doc values like keywords, but index data using points rather than an
         // inverted index, so this triggers a different code path to disable dynamic pruning