Browse Source

Text field block loader properly handles null values from delegate (#127525)

Oleksandr Kolomiiets 5 months ago
parent
commit
0df9d1c4c2

+ 36 - 0
docs/reference/elasticsearch/mapping-reference/keyword.md

@@ -232,6 +232,42 @@ Will become:
 }
 ```
 
+If `null_value` is configured, `null` values are replaced with the `null_value` in synthetic source:
+
+$$$synthetic-source-keyword-example-null-value$$$
+
+```console
+PUT idx
+{
+  "settings": {
+    "index": {
+      "mapping": {
+        "source": {
+          "mode": "synthetic"
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "kwd": { "type": "keyword", "null_value": "NA" }
+    }
+  }
+}
+PUT idx/_doc/1
+{
+  "kwd": ["foo", null, "bar"]
+}
+```
+
+Will become:
+
+```console-result
+{
+  "kwd": ["bar", "foo", "NA"]
+}
+```
+
 
 ## Constant keyword field type [constant-keyword-field-type]
 

+ 26 - 13
docs/reference/elasticsearch/mapping-reference/text.md

@@ -104,11 +104,17 @@ Synthetic `_source` is Generally Available only for TSDB indices (indices that h
 ::::
 
 
-`text` fields support [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) if they have a [`keyword`](/reference/elasticsearch/mapping-reference/keyword.md#keyword-synthetic-source) sub-field that supports synthetic `_source` or if the `text` field sets `store` to `true`. Either way, it may not have [`copy_to`](/reference/elasticsearch/mapping-reference/copy-to.md).
+`text` fields can use a [`keyword`](/reference/elasticsearch/mapping-reference/keyword.md#keyword-synthetic-source) sub-field to support [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) without storing values of the text field itself.
 
-If using a sub-`keyword` field, then the values are sorted in the same way as a `keyword` field’s values are sorted. By default, that means sorted with duplicates removed. So:
+In this case, the synthetic source of the `text` field will have the same [modifications](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) as a `keyword` field.
 
-$$$synthetic-source-text-example-default$$$
+These modifications can impact usage of `text` fields:
+* Reordering text fields can have an effect on [phrase](/reference/query-languages/query-dsl/query-dsl-match-query-phrase.md) and [span](/reference/query-languages/query-dsl/span-queries.md) queries. See the discussion about [`position_increment_gap`](/reference/elasticsearch/mapping-reference/position-increment-gap.md) for more details. You can avoid this by making sure the `slop` parameter on the phrase queries is lower than the `position_increment_gap`. This is the default.
+* Handling of `null` values is different. `text` fields ignore `null` values, but `keyword` fields support replacing `null` values with a value specified in the `null_value` parameter. This replacement is represented in synthetic source.
+
+For example:
+
+$$$synthetic-source-text-example-multi-field$$$
 
 ```console
 PUT idx
@@ -127,8 +133,9 @@ PUT idx
       "text": {
         "type": "text",
         "fields": {
-          "raw": {
-            "type": "keyword"
+          "kwd": {
+            "type": "keyword",
+            "null_value": "NA"
           }
         }
       }
@@ -138,9 +145,10 @@ PUT idx
 PUT idx/_doc/1
 {
   "text": [
+    null,
     "the quick brown fox",
     "the quick brown fox",
-    "jumped over the lazy dog"
+    "jumped over the lazy dog",
   ]
 }
 ```
@@ -150,18 +158,15 @@ Will become:
 ```console-result
 {
   "text": [
-    "jumped over the lazy dog",
+    "jumped over the lazy dog"
+    "NA",
     "the quick brown fox"
   ]
 }
 ```
 
-::::{note}
-Reordering text fields can have an effect on [phrase](/reference/query-languages/query-dsl/query-dsl-match-query-phrase.md) and [span](/reference/query-languages/query-dsl/span-queries.md) queries. See the discussion about [`position_increment_gap`](/reference/elasticsearch/mapping-reference/position-increment-gap.md) for more detail. You can avoid this by making sure the `slop` parameter on the phrase queries is lower than the `position_increment_gap`. This is the default.
-::::
 
-
-If the `text` field sets `store` to true then order and duplicates are preserved.
+If the `text` field sets `store` to `true` then the sub-field is not used and no modifications are applied. For example:
 
 $$$synthetic-source-text-example-stored$$$
 
@@ -179,7 +184,15 @@ PUT idx
   },
   "mappings": {
     "properties": {
-      "text": { "type": "text", "store": true }
+      "text": {
+        "type": "text",
+        "store": true,
+        "fields": {
+          "raw": {
+            "type": "keyword"
+          }
+        }
+      }
     }
   }
 }

+ 7 - 0
server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java

@@ -1087,6 +1087,13 @@ public final class TextFieldMapper extends FieldMapper {
          * using whatever
          */
         private BlockSourceReader.LeafIteratorLookup blockReaderDisiLookup(BlockLoaderContext blContext) {
+            if (isSyntheticSource && syntheticSourceDelegate != null) {
+                // Since we are using synthetic source and a delegate, we can't use this field
+                // to determine if the delegate has values in the document (f.e. handling of `null` is different
+                // between text and keyword).
+                return BlockSourceReader.lookupMatchingAll();
+            }
+
             if (isIndexed()) {
                 if (getTextSearchInfo().hasNorms()) {
                     return BlockSourceReader.lookupFromNorms(name());

+ 1 - 14
server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java

@@ -62,15 +62,8 @@ public class TextFieldBlockLoaderTests extends BlockLoaderTestCase {
             if (params.syntheticSource() && testContext.forceFallbackSyntheticSource() == false && usingSyntheticSourceDelegate) {
                 var nullValue = (String) keywordMultiFieldMapping.get("null_value");
 
-                // Due to how TextFieldMapper#blockReaderDisiLookup works this is complicated.
-                // If we are using lookupMatchingAll() then we'll see all docs, generate synthetic source using syntheticSourceDelegate,
-                // parse it and see null_value inside.
-                // But if we are using lookupFromNorms() we will skip the document (since the text field itself does not exist).
-                // Same goes for lookupFromFieldNames().
-                boolean textFieldIndexed = (boolean) fieldMapping.getOrDefault("index", true);
-
                 if (value == null) {
-                    if (textFieldIndexed == false && nullValue != null && nullValue.length() <= (int) ignoreAbove) {
+                    if (nullValue != null && nullValue.length() <= (int) ignoreAbove) {
                         return new BytesRef(nullValue);
                     }
 
@@ -82,12 +75,6 @@ public class TextFieldBlockLoaderTests extends BlockLoaderTestCase {
                 }
 
                 var values = (List<String>) value;
-
-                // See note above about TextFieldMapper#blockReaderDisiLookup.
-                if (textFieldIndexed && values.stream().allMatch(Objects::isNull)) {
-                    return null;
-                }
-
                 var indexed = values.stream()
                     .map(s -> s == null ? nullValue : s)
                     .filter(Objects::nonNull)