Browse Source

Use FallbackSyntheticSourceBlockLoader for text fields (#126237)

Oleksandr Kolomiiets 6 months ago
parent
commit
21ff72bef4

+ 5 - 0
docs/changelog/126237.yaml

@@ -0,0 +1,5 @@
+pr: 126237
+summary: Use `FallbackSyntheticSourceBlockLoader` for text fields
+area: Mapping
+type: enhancement
+issues: []

+ 44 - 0
server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java

@@ -74,6 +74,7 @@ import org.elasticsearch.script.field.TextDocValuesField;
 import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
 import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
 import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentParser;
 
 
 import java.io.IOException;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.ArrayList;
@@ -1016,10 +1017,53 @@ public final class TextFieldMapper extends FieldMapper {
             if (isStored()) {
             if (isStored()) {
                 return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name());
                 return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name());
             }
             }
+
+            // _ignored_source field will only be present if text field is not stored
+            // and there is no syntheticSourceDelegate
+            if (isSyntheticSource && syntheticSourceDelegate == null) {
+                return fallbackSyntheticSourceBlockLoader();
+            }
+
             SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name()));
             SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name()));
             return new BlockSourceReader.BytesRefsBlockLoader(fetcher, blockReaderDisiLookup(blContext));
             return new BlockSourceReader.BytesRefsBlockLoader(fetcher, blockReaderDisiLookup(blContext));
         }
         }
 
 
+        FallbackSyntheticSourceBlockLoader fallbackSyntheticSourceBlockLoader() {
+            var reader = new FallbackSyntheticSourceBlockLoader.SingleValueReader<BytesRef>(null) {
+                @Override
+                public void convertValue(Object value, List<BytesRef> accumulator) {
+                    if (value != null) {
+                        accumulator.add(new BytesRef(value.toString()));
+                    }
+                }
+
+                @Override
+                protected void parseNonNullValue(XContentParser parser, List<BytesRef> accumulator) throws IOException {
+                    var text = parser.textOrNull();
+
+                    if (text != null) {
+                        accumulator.add(new BytesRef(text));
+                    }
+                }
+
+                @Override
+                public void writeToBlock(List<BytesRef> values, BlockLoader.Builder blockBuilder) {
+                    var bytesRefBuilder = (BlockLoader.BytesRefBuilder) blockBuilder;
+
+                    for (var value : values) {
+                        bytesRefBuilder.appendBytesRef(value);
+                    }
+                }
+            };
+
+            return new FallbackSyntheticSourceBlockLoader(reader, name()) {
+                @Override
+                public Builder builder(BlockFactory factory, int expectedCount) {
+                    return factory.bytesRefs(expectedCount);
+                }
+            };
+        }
+
         /**
         /**
          * Build an iterator of documents that have the field. This mirrors parseCreateField,
          * Build an iterator of documents that have the field. This mirrors parseCreateField,
          * using whatever
          * using whatever

+ 7 - 3
server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java

@@ -25,9 +25,13 @@ public class KeywordFieldBlockLoaderTests extends BlockLoaderTestCase {
         super(FieldType.KEYWORD.toString(), params);
         super(FieldType.KEYWORD.toString(), params);
     }
     }
 
 
-    @SuppressWarnings("unchecked")
     @Override
     @Override
     protected Object expected(Map<String, Object> fieldMapping, Object value, TestContext testContext) {
     protected Object expected(Map<String, Object> fieldMapping, Object value, TestContext testContext) {
+        return expectedValue(fieldMapping, value, params, testContext);
+    }
+
+    @SuppressWarnings("unchecked")
+    public static Object expectedValue(Map<String, Object> fieldMapping, Object value, Params params, TestContext testContext) {
         var nullValue = (String) fieldMapping.get("null_value");
         var nullValue = (String) fieldMapping.get("null_value");
 
 
         var ignoreAbove = fieldMapping.get("ignore_above") == null
         var ignoreAbove = fieldMapping.get("ignore_above") == null
@@ -45,7 +49,7 @@ public class KeywordFieldBlockLoaderTests extends BlockLoaderTestCase {
         Function<Stream<String>, Stream<BytesRef>> convertValues = s -> s.map(v -> convert(v, nullValue, ignoreAbove))
         Function<Stream<String>, Stream<BytesRef>> convertValues = s -> s.map(v -> convert(v, nullValue, ignoreAbove))
             .filter(Objects::nonNull);
             .filter(Objects::nonNull);
 
 
-        boolean hasDocValues = hasDocValues(fieldMapping, false);
+        boolean hasDocValues = hasDocValues(fieldMapping, true);
         boolean useDocValues = params.preference() == MappedFieldType.FieldExtractPreference.NONE
         boolean useDocValues = params.preference() == MappedFieldType.FieldExtractPreference.NONE
             || params.preference() == MappedFieldType.FieldExtractPreference.DOC_VALUES
             || params.preference() == MappedFieldType.FieldExtractPreference.DOC_VALUES
             || params.syntheticSource();
             || params.syntheticSource();
@@ -63,7 +67,7 @@ public class KeywordFieldBlockLoaderTests extends BlockLoaderTestCase {
         return maybeFoldList(resultList);
         return maybeFoldList(resultList);
     }
     }
 
 
-    private BytesRef convert(String value, String nullValue, int ignoreAbove) {
+    private static BytesRef convert(String value, String nullValue, int ignoreAbove) {
         if (value == null) {
         if (value == null) {
             if (nullValue != null) {
             if (nullValue != null) {
                 value = nullValue;
                 value = nullValue;

+ 135 - 0
server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java

@@ -0,0 +1,135 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.index.mapper.blockloader;
+
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.index.mapper.BlockLoaderTestCase;
+import org.elasticsearch.logsdb.datageneration.FieldType;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.stream.Collectors;
+
+public class TextFieldBlockLoaderTests extends BlockLoaderTestCase {
+    public TextFieldBlockLoaderTests(Params params) {
+        super(FieldType.TEXT.toString(), params);
+    }
+
+    @SuppressWarnings("unchecked")
+    @Override
+    protected Object expected(Map<String, Object> fieldMapping, Object value, TestContext testContext) {
+        if (fieldMapping.getOrDefault("store", false).equals(true)) {
+            return valuesInSourceOrder(value);
+        }
+
+        var fields = (Map<String, Object>) fieldMapping.get("fields");
+        if (fields != null) {
+            var keywordMultiFieldMapping = (Map<String, Object>) fields.get("kwd");
+            boolean docValues = hasDocValues(keywordMultiFieldMapping, true);
+            boolean index = keywordMultiFieldMapping.getOrDefault("index", true).equals(true);
+            boolean store = keywordMultiFieldMapping.getOrDefault("store", false).equals(true);
+            Object ignoreAbove = keywordMultiFieldMapping.get("ignore_above");
+
+            // See TextFieldMapper.SyntheticSourceHelper#usingSyntheticSourceDelegate
+            // and TextFieldMapper#canUseSyntheticSourceDelegateForLoading().
+            boolean usingSyntheticSourceDelegate = docValues || store;
+            boolean canUseSyntheticSourceDelegateForLoading = usingSyntheticSourceDelegate && ignoreAbove == null && (index || store);
+            if (canUseSyntheticSourceDelegateForLoading) {
+                return KeywordFieldBlockLoaderTests.expectedValue(keywordMultiFieldMapping, value, params, testContext);
+            }
+
+            // Even if multi-field is not eligible for loading it can still be used to produce synthetic source
+            // and then we load from the synthetic source.
+            // Synthetic source is actually different from keyword field block loader results
+            // because synthetic source includes values exceeding ignore_above and block loader doesn't.
+            // TODO ideally this logic should be in some kind of KeywordFieldSyntheticSourceTest that uses same infra as
+            // KeywordFieldBlockLoaderTest
+            // It is here since KeywordFieldBlockLoaderTest does not really need it
+            if (params.syntheticSource() && testContext.forceFallbackSyntheticSource() == false && usingSyntheticSourceDelegate) {
+                var nullValue = (String) keywordMultiFieldMapping.get("null_value");
+
+                // Due to how TextFieldMapper#blockReaderDisiLookup works this is complicated.
+                // If we are using lookupMatchingAll() then we'll see all docs, generate synthetic source using syntheticSourceDelegate,
+                // parse it and see null_value inside.
+                // But if we are using lookupFromNorms() we will skip the document (since the text field itself does not exist).
+                // Same goes for lookupFromFieldNames().
+                boolean textFieldIndexed = (boolean) fieldMapping.getOrDefault("index", true);
+
+                if (value == null) {
+                    if (textFieldIndexed == false
+                        && nullValue != null
+                        && (ignoreAbove == null || nullValue.length() <= (int) ignoreAbove)) {
+                        return new BytesRef(nullValue);
+                    }
+
+                    return null;
+                }
+
+                if (value instanceof String s) {
+                    return new BytesRef(s);
+                }
+
+                var values = (List<String>) value;
+
+                // See note above about TextFieldMapper#blockReaderDisiLookup.
+                if (textFieldIndexed && values.stream().allMatch(Objects::isNull)) {
+                    return null;
+                }
+
+                var indexed = values.stream()
+                    .map(s -> s == null ? nullValue : s)
+                    .filter(Objects::nonNull)
+                    .filter(s -> ignoreAbove == null || s.length() <= (int) ignoreAbove)
+                    .map(BytesRef::new)
+                    .collect(Collectors.toList());
+
+                if (store == false) {
+                    // using doc_values for synthetic source
+                    indexed = new ArrayList<>(new HashSet<>(indexed));
+                    indexed.sort(BytesRef::compareTo);
+                }
+
+                // ignored values always come last
+                List<BytesRef> ignored = ignoreAbove == null
+                    ? List.of()
+                    : values.stream()
+                        .map(s -> s == null ? nullValue : s)
+                        .filter(Objects::nonNull)
+                        .filter(s -> s.length() > (int) ignoreAbove)
+                        .map(BytesRef::new)
+                        .toList();
+
+                indexed.addAll(ignored);
+
+                return maybeFoldList(indexed);
+            }
+        }
+
+        // Loading from _ignored_source or stored _source
+        return valuesInSourceOrder(value);
+    }
+
+    @SuppressWarnings("unchecked")
+    private Object valuesInSourceOrder(Object value) {
+        if (value == null) {
+            return null;
+        }
+
+        if (value instanceof String s) {
+            return new BytesRef(s);
+        }
+
+        var resultList = ((List<String>) value).stream().filter(Objects::nonNull).map(BytesRef::new).toList();
+        return maybeFoldList(resultList);
+    }
+}

+ 5 - 1
test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/FieldType.java

@@ -23,6 +23,7 @@ import org.elasticsearch.logsdb.datageneration.fields.leaf.KeywordFieldDataGener
 import org.elasticsearch.logsdb.datageneration.fields.leaf.LongFieldDataGenerator;
 import org.elasticsearch.logsdb.datageneration.fields.leaf.LongFieldDataGenerator;
 import org.elasticsearch.logsdb.datageneration.fields.leaf.ScaledFloatFieldDataGenerator;
 import org.elasticsearch.logsdb.datageneration.fields.leaf.ScaledFloatFieldDataGenerator;
 import org.elasticsearch.logsdb.datageneration.fields.leaf.ShortFieldDataGenerator;
 import org.elasticsearch.logsdb.datageneration.fields.leaf.ShortFieldDataGenerator;
+import org.elasticsearch.logsdb.datageneration.fields.leaf.TextFieldDataGenerator;
 import org.elasticsearch.logsdb.datageneration.fields.leaf.UnsignedLongFieldDataGenerator;
 import org.elasticsearch.logsdb.datageneration.fields.leaf.UnsignedLongFieldDataGenerator;
 
 
 /**
 /**
@@ -42,7 +43,8 @@ public enum FieldType {
     COUNTED_KEYWORD("counted_keyword"),
     COUNTED_KEYWORD("counted_keyword"),
     BOOLEAN("boolean"),
     BOOLEAN("boolean"),
     DATE("date"),
     DATE("date"),
-    GEO_POINT("geo_point");
+    GEO_POINT("geo_point"),
+    TEXT("text");
 
 
     private final String name;
     private final String name;
 
 
@@ -66,6 +68,7 @@ public enum FieldType {
             case BOOLEAN -> new BooleanFieldDataGenerator(dataSource);
             case BOOLEAN -> new BooleanFieldDataGenerator(dataSource);
             case DATE -> new DateFieldDataGenerator(dataSource);
             case DATE -> new DateFieldDataGenerator(dataSource);
             case GEO_POINT -> new GeoPointFieldDataGenerator(dataSource);
             case GEO_POINT -> new GeoPointFieldDataGenerator(dataSource);
+            case TEXT -> new TextFieldDataGenerator(dataSource);
         };
         };
     }
     }
 
 
@@ -85,6 +88,7 @@ public enum FieldType {
             case "boolean" -> FieldType.BOOLEAN;
             case "boolean" -> FieldType.BOOLEAN;
             case "date" -> FieldType.DATE;
             case "date" -> FieldType.DATE;
             case "geo_point" -> FieldType.GEO_POINT;
             case "geo_point" -> FieldType.GEO_POINT;
+            case "text" -> FieldType.TEXT;
             default -> null;
             default -> null;
         };
         };
     }
     }

+ 31 - 4
test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/datasource/DefaultMappingParametersHandler.java

@@ -35,10 +35,7 @@ public class DefaultMappingParametersHandler implements DataSourceHandler {
             return null;
             return null;
         }
         }
 
 
-        var map = new HashMap<String, Object>();
-        map.put("store", ESTestCase.randomBoolean());
-        map.put("index", ESTestCase.randomBoolean());
-        map.put("doc_values", ESTestCase.randomBoolean());
+        var map = commonMappingParameters();
         if (ESTestCase.randomBoolean()) {
         if (ESTestCase.randomBoolean()) {
             map.put(Mapper.SYNTHETIC_SOURCE_KEEP_PARAM, ESTestCase.randomFrom("none", "arrays", "all"));
             map.put(Mapper.SYNTHETIC_SOURCE_KEEP_PARAM, ESTestCase.randomFrom("none", "arrays", "all"));
         }
         }
@@ -51,6 +48,7 @@ public class DefaultMappingParametersHandler implements DataSourceHandler {
             case BOOLEAN -> booleanMapping(map);
             case BOOLEAN -> booleanMapping(map);
             case DATE -> dateMapping(map);
             case DATE -> dateMapping(map);
             case GEO_POINT -> geoPointMapping(map);
             case GEO_POINT -> geoPointMapping(map);
+            case TEXT -> textMapping(request, new HashMap<>());
         });
         });
     }
     }
 
 
@@ -190,6 +188,35 @@ public class DefaultMappingParametersHandler implements DataSourceHandler {
         };
         };
     }
     }
 
 
+    private Supplier<Map<String, Object>> textMapping(
+        DataSourceRequest.LeafMappingParametersGenerator request,
+        Map<String, Object> injected
+    ) {
+        return () -> {
+            injected.put("store", ESTestCase.randomBoolean());
+            injected.put("index", ESTestCase.randomBoolean());
+
+            if (ESTestCase.randomDouble() <= 0.1) {
+                var keywordMultiFieldMapping = keywordMapping(request, commonMappingParameters()).get();
+                keywordMultiFieldMapping.put("type", "keyword");
+                keywordMultiFieldMapping.remove("copy_to");
+
+                injected.put("fields", Map.of("kwd", keywordMultiFieldMapping));
+
+            }
+
+            return injected;
+        };
+    }
+
+    private static HashMap<String, Object> commonMappingParameters() {
+        var map = new HashMap<String, Object>();
+        map.put("store", ESTestCase.randomBoolean());
+        map.put("index", ESTestCase.randomBoolean());
+        map.put("doc_values", ESTestCase.randomBoolean());
+        return map;
+    }
+
     @Override
     @Override
     public DataSourceResponse.ObjectMappingParametersGenerator handle(DataSourceRequest.ObjectMappingParametersGenerator request) {
     public DataSourceResponse.ObjectMappingParametersGenerator handle(DataSourceRequest.ObjectMappingParametersGenerator request) {
         if (request.isNested()) {
         if (request.isNested()) {

+ 34 - 0
test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/fields/leaf/TextFieldDataGenerator.java

@@ -0,0 +1,34 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.logsdb.datageneration.fields.leaf;
+
+import org.elasticsearch.logsdb.datageneration.FieldDataGenerator;
+import org.elasticsearch.logsdb.datageneration.datasource.DataSource;
+import org.elasticsearch.logsdb.datageneration.datasource.DataSourceRequest;
+
+import java.util.Map;
+import java.util.function.Supplier;
+
+public class TextFieldDataGenerator implements FieldDataGenerator {
+    private final Supplier<Object> valueGenerator;
+
+    public TextFieldDataGenerator(DataSource dataSource) {
+        var strings = dataSource.get(new DataSourceRequest.StringGenerator());
+        var nulls = dataSource.get(new DataSourceRequest.NullWrapper());
+        var arrays = dataSource.get(new DataSourceRequest.ArrayWrapper());
+
+        this.valueGenerator = arrays.wrapper().compose(nulls.wrapper()).apply(() -> strings.generator().get());
+    }
+
+    @Override
+    public Object generateValue(Map<String, Object> fieldMapping) {
+        return valueGenerator.get();
+    }
+}

+ 68 - 0
test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java

@@ -61,6 +61,7 @@ interface FieldSpecificMatcher {
                 put("geo_shape", new ExactMatcher("geo_shape", actualMappings, actualSettings, expectedMappings, expectedSettings));
                 put("geo_shape", new ExactMatcher("geo_shape", actualMappings, actualSettings, expectedMappings, expectedSettings));
                 put("shape", new ExactMatcher("shape", actualMappings, actualSettings, expectedMappings, expectedSettings));
                 put("shape", new ExactMatcher("shape", actualMappings, actualSettings, expectedMappings, expectedSettings));
                 put("geo_point", new GeoPointMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings));
                 put("geo_point", new GeoPointMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings));
+                put("text", new TextMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings));
             }
             }
         };
         };
     }
     }
@@ -598,6 +599,73 @@ interface FieldSpecificMatcher {
         }
         }
     }
     }
 
 
+    class TextMatcher implements FieldSpecificMatcher {
+        private final XContentBuilder actualMappings;
+        private final Settings.Builder actualSettings;
+        private final XContentBuilder expectedMappings;
+        private final Settings.Builder expectedSettings;
+
+        TextMatcher(
+            XContentBuilder actualMappings,
+            Settings.Builder actualSettings,
+            XContentBuilder expectedMappings,
+            Settings.Builder expectedSettings
+        ) {
+            this.actualMappings = actualMappings;
+            this.actualSettings = actualSettings;
+            this.expectedMappings = expectedMappings;
+            this.expectedSettings = expectedSettings;
+        }
+
+        @Override
+        @SuppressWarnings("unchecked")
+        public MatchResult match(
+            List<Object> actual,
+            List<Object> expected,
+            Map<String, Object> actualMapping,
+            Map<String, Object> expectedMapping
+        ) {
+            var expectedNormalized = normalize(expected);
+            var actualNormalized = normalize(actual);
+
+            // Match simply as text first.
+            if (actualNormalized.equals(expectedNormalized)) {
+                return MatchResult.match();
+            }
+
+            // In some cases synthetic source for text fields is synthesized using the keyword multi field.
+            // So in this case it's appropriate to match it using keyword matching logic (mainly to cover `null_value`).
+            var multiFields = (Map<String, Object>) getMappingParameter("fields", actualMapping, expectedMapping);
+            if (multiFields != null) {
+                var keywordMatcher = new KeywordMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings);
+
+                var keywordFieldMapping = (Map<String, Object>) multiFields.get("kwd");
+                var keywordMatchResult = keywordMatcher.match(actual, expected, keywordFieldMapping, keywordFieldMapping);
+                if (keywordMatchResult.isMatch()) {
+                    return MatchResult.match();
+                }
+            }
+
+            return MatchResult.noMatch(
+                formatErrorMessage(
+                    actualMappings,
+                    actualSettings,
+                    expectedMappings,
+                    expectedSettings,
+                    "Values of type [text] don't match, " + prettyPrintCollections(actual, expected)
+                )
+            );
+        }
+
+        private Set<Object> normalize(List<Object> values) {
+            if (values == null) {
+                return Set.of();
+            }
+
+            return values.stream().filter(Objects::nonNull).collect(Collectors.toSet());
+        }
+    }
+
     /**
     /**
      * Generic matcher that supports common matching logic like null values.
      * Generic matcher that supports common matching logic like null values.
      */
      */

+ 1 - 1
test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/MappingTransforms.java

@@ -46,7 +46,7 @@ class MappingTransforms {
             if (entry.getKey().equals("_doc") || entry.getKey().equals("properties")) {
             if (entry.getKey().equals("_doc") || entry.getKey().equals("properties")) {
                 descend(pathFromRoot, (Map<String, Object>) entry.getValue(), flattened);
                 descend(pathFromRoot, (Map<String, Object>) entry.getValue(), flattened);
             } else {
             } else {
-                if (entry.getValue() instanceof Map<?, ?> map) {
+                if (entry.getKey().equals("fields") == false && entry.getValue() instanceof Map<?, ?> map) {
                     var pathToField = pathFromRoot == null ? entry.getKey() : pathFromRoot + "." + entry.getKey();
                     var pathToField = pathFromRoot == null ? entry.getKey() : pathFromRoot + "." + entry.getKey();
 
 
                     // Descending to subobject, we need to remember parent mapping
                     // Descending to subobject, we need to remember parent mapping