浏览代码

Optimise frequent item sets aggregation for single value fields (#108130)

Similar to #107832, this commit optimize requent item sets aggregation for single value fields.
Ignacio Vera 1 年之前
父节点
当前提交
c89de11e57

+ 5 - 0
docs/changelog/108130.yaml

@@ -0,0 +1,5 @@
+pr: 108130
+summary: Optimise frequent item sets aggregation for single value fields
+area: Machine Learning
+type: enhancement
+issues: []

+ 71 - 30
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/frequentitemsets/mr/ItemSetMapReduceValueSource.java

@@ -9,7 +9,11 @@ package org.elasticsearch.xpack.ml.aggs.frequentitemsets.mr;
 
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.util.BytesRef;
@@ -19,6 +23,7 @@ import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
 import org.elasticsearch.core.Tuple;
+import org.elasticsearch.index.fielddata.FieldData;
 import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude;
@@ -204,15 +209,19 @@ public abstract class ItemSetMapReduceValueSource {
             @Override
             public ValueCollector getValueCollector(LeafReaderContext ctx) throws IOException {
                 this.docValues = source.globalOrdinalsValues(ctx);
-                ;
-                final Tuple<Field, List<Object>> empty = new Tuple<>(field, Collections.emptyList());
                 final SortedSetDocValues values = this.docValues;
+                final SortedDocValues singleton = DocValues.unwrapSingleton(values);
+                final Tuple<Field, List<Object>> empty = new Tuple<>(field, Collections.emptyList());
+                return singleton != null ? getValueCollector(singleton, empty) : getValueCollector(values, empty);
+            }
+
+            private ValueCollector getValueCollector(SortedSetDocValues values, Tuple<Field, List<Object>> empty) {
                 return doc -> {
                     if (values.advanceExact(doc)) {
-                        int valuesCount = values.docValueCount();
+                        final int valuesCount = values.docValueCount();
 
                         if (valuesCount == 1) {
-                            long v = values.nextOrd();
+                            final long v = values.nextOrd();
                             assert v >= 0;
                             if (bitSetFilter == null || bitSetFilter.get(v)) {
                                 return new Tuple<>(field, Collections.singletonList(v));
@@ -220,14 +229,9 @@ public abstract class ItemSetMapReduceValueSource {
                             return empty;
                         }
 
-                        if (valuesCount == 0) {
-                            return empty;
-                        }
-
-                        List<Object> objects = new ArrayList<>(valuesCount);
-
+                        final List<Object> objects = new ArrayList<>(valuesCount);
                         for (int i = 0; i < valuesCount; ++i) {
-                            long v = values.nextOrd();
+                            final long v = values.nextOrd();
                             assert v >= 0;
                             if (bitSetFilter == null || bitSetFilter.get(v)) {
                                 objects.add(v);
@@ -239,6 +243,20 @@ public abstract class ItemSetMapReduceValueSource {
                 };
             }
 
+            private ValueCollector getValueCollector(SortedDocValues values, Tuple<Field, List<Object>> empty) {
+                return doc -> {
+                    if (values.advanceExact(doc)) {
+                        final long v = values.ordValue();
+                        assert v >= 0;
+                        if (bitSetFilter == null || bitSetFilter.get(v)) {
+                            return new Tuple<>(field, Collections.singletonList(v));
+                        }
+                        return empty;
+                    }
+                    return empty;
+                };
+            }
+
             @Override
             public boolean usesOrdinals() {
                 return true;
@@ -265,28 +283,27 @@ public abstract class ItemSetMapReduceValueSource {
             @Override
             public ValueCollector getValueCollector(LeafReaderContext ctx) throws IOException {
                 final SortedBinaryDocValues values = source.bytesValues(ctx);
+                final BinaryDocValues singleton = FieldData.unwrapSingleton(values);
                 final Tuple<Field, List<Object>> empty = new Tuple<>(field, Collections.emptyList());
+                return singleton != null ? getValueCollector(singleton, empty) : getValueCollector(values, empty);
+            }
 
+            private ValueCollector getValueCollector(SortedBinaryDocValues values, Tuple<Field, List<Object>> empty) {
                 return doc -> {
                     if (values.advanceExact(doc)) {
-                        int valuesCount = values.docValueCount();
+                        final int valuesCount = values.docValueCount();
 
                         if (valuesCount == 1) {
-                            BytesRef v = values.nextValue();
+                            final BytesRef v = values.nextValue();
                             if (stringFilter == null || stringFilter.accept(v)) {
                                 return new Tuple<>(field, Collections.singletonList(BytesRef.deepCopyOf(v)));
                             }
                             return empty;
                         }
 
-                        if (valuesCount == 0) {
-                            return empty;
-                        }
-
-                        List<Object> objects = new ArrayList<>(valuesCount);
-
+                        final List<Object> objects = new ArrayList<>(valuesCount);
                         for (int i = 0; i < valuesCount; ++i) {
-                            BytesRef v = values.nextValue();
+                            final BytesRef v = values.nextValue();
                             if (stringFilter == null || stringFilter.accept(v)) {
                                 objects.add(BytesRef.deepCopyOf(v));
                             }
@@ -297,6 +314,19 @@ public abstract class ItemSetMapReduceValueSource {
                 };
             }
 
+            private ValueCollector getValueCollector(BinaryDocValues values, Tuple<Field, List<Object>> empty) {
+                return doc -> {
+                    if (values.advanceExact(doc)) {
+                        final BytesRef v = values.binaryValue();
+                        if (stringFilter == null || stringFilter.accept(v)) {
+                            return new Tuple<>(field, Collections.singletonList(BytesRef.deepCopyOf(v)));
+                        }
+                        return empty;
+                    }
+                    return empty;
+                };
+            }
+
             @Override
             public boolean usesOrdinals() {
                 return false;
@@ -374,29 +404,28 @@ public abstract class ItemSetMapReduceValueSource {
         @Override
         ValueCollector getValueCollector(LeafReaderContext ctx) throws IOException {
             final SortedNumericDocValues values = source.longValues(ctx);
+            final NumericDocValues singleton = DocValues.unwrapSingleton(values);
             final Field field = getField();
             final Tuple<Field, List<Object>> empty = new Tuple<>(field, Collections.emptyList());
+            return singleton != null ? getValueCollector(singleton, empty, field) : getValueCollector(values, empty, field);
+        }
 
+        private ValueCollector getValueCollector(SortedNumericDocValues values, Tuple<Field, List<Object>> empty, Field field) {
             return doc -> {
                 if (values.advanceExact(doc)) {
-                    int valuesCount = values.docValueCount();
+                    final int valuesCount = values.docValueCount();
 
                     if (valuesCount == 1) {
-                        long v = values.nextValue();
+                        final long v = values.nextValue();
                         if (longFilter == null || longFilter.accept(v)) {
-                            return new Tuple<>(getField(), Collections.singletonList(v));
+                            return new Tuple<>(field, Collections.singletonList(v));
                         }
                         return empty;
                     }
 
-                    if (valuesCount == 0) {
-                        return empty;
-                    }
-
-                    List<Object> objects = new ArrayList<>(valuesCount);
-
+                    final List<Object> objects = new ArrayList<>(valuesCount);
                     for (int i = 0; i < valuesCount; ++i) {
-                        long v = values.nextValue();
+                        final long v = values.nextValue();
                         if (longFilter == null || longFilter.accept(v)) {
                             objects.add(v);
                         }
@@ -407,5 +436,17 @@ public abstract class ItemSetMapReduceValueSource {
             };
         }
 
+        private ValueCollector getValueCollector(NumericDocValues values, Tuple<Field, List<Object>> empty, Field field) {
+            return doc -> {
+                if (values.advanceExact(doc)) {
+                    final long v = values.longValue();
+                    if (longFilter == null || longFilter.accept(v)) {
+                        return new Tuple<>(field, Collections.singletonList(v));
+                    }
+                    return empty;
+                }
+                return empty;
+            };
+        }
     }
 }