Browse Source

Enable fallback synthetic source for token_count (#109044)

Oleksandr Kolomiiets 1 year ago
parent
commit
42f4294a86

+ 5 - 0
docs/changelog/109044.yaml

@@ -0,0 +1,5 @@
+pr: 109044
+summary: Enable fallback synthetic source for `token_count`
+area: Mapping
+type: feature
+issues: []

+ 1 - 0
docs/reference/mapping/fields/synthetic-source.asciidoc

@@ -64,6 +64,7 @@ types:
 ** <<search-as-you-type-synthetic-source,`search_as_you_type`>>
 ** <<numeric-synthetic-source,`short`>>
 ** <<text-synthetic-source,`text`>>
+** <<token-count-synthetic-source,`token_count`>>
 ** <<version-synthetic-source,`version`>>
 ** <<wildcard-synthetic-source,`wildcard`>>
 

+ 17 - 3
docs/reference/mapping/types/token-count.asciidoc

@@ -64,10 +64,10 @@ The following parameters are accepted by `token_count` fields:
     value. Required. For best performance, use an analyzer without token
     filters.
 
-`enable_position_increments`:: 
+`enable_position_increments`::
 
-Indicates if position increments should be counted. 
-Set to `false` if you don't want to count tokens removed by analyzer filters (like <<analysis-stop-tokenfilter,`stop`>>). 
+Indicates if position increments should be counted.
+Set to `false` if you don't want to count tokens removed by analyzer filters (like <<analysis-stop-tokenfilter,`stop`>>).
 Defaults to `true`.
 
 <<doc-values,`doc_values`>>::
@@ -91,3 +91,17 @@ Defaults to `true`.
     Whether the field value should be stored and retrievable separately from
     the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
     (default).
+
+[[token-count-synthetic-source]]
+===== Synthetic `_source`
+
+IMPORTANT: Synthetic `_source` is Generally Available only for TSDB indices
+(indices that have `index.mode` set to `time_series`). For other indices
+synthetic `_source` is in technical preview. Features in technical preview may
+be changed or removed in a future release. Elastic will work to fix
+any issues, but features in technical preview are not subject to the support SLA
+of official GA features.
+
+`token_count` fields support <<synthetic-source,synthetic `_source`>> in their
+default configuration. Synthetic `_source` cannot be used together with
+<<copy-to,`copy_to`>>.

+ 5 - 0
modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/TokenCountFieldMapper.java

@@ -215,4 +215,9 @@ public class TokenCountFieldMapper extends FieldMapper {
     public FieldMapper.Builder getMergeBuilder() {
         return new Builder(simpleName()).init(this);
     }
+
+    @Override
+    protected SyntheticSourceMode syntheticSourceMode() {
+        return SyntheticSourceMode.FALLBACK;
+    }
 }

+ 64 - 1
modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/TokenCountFieldMapperTests.java

@@ -33,7 +33,11 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.List;
 import java.util.Map;
+import java.util.Objects;
+import java.util.function.Function;
+import java.util.stream.Collectors;
 
 import static org.hamcrest.Matchers.equalTo;
 
@@ -196,7 +200,66 @@ public class TokenCountFieldMapperTests extends MapperTestCase {
 
     @Override
     protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) {
-        throw new AssumptionViolatedException("not supported");
+        assertFalse(ignoreMalformed);
+
+        var nullValue = usually() ? null : randomNonNegativeInt();
+        return new SyntheticSourceSupport() {
+            @Override
+            public boolean preservesExactSource() {
+                return true;
+            }
+
+            public SyntheticSourceExample example(int maxValues) {
+                if (randomBoolean()) {
+                    var value = generateValue();
+                    return new SyntheticSourceExample(value.text, value.text, value.tokenCount, this::mapping);
+                }
+
+                var values = randomList(1, 5, this::generateValue);
+
+                var textArray = values.stream().map(Value::text).toList();
+
+                var blockExpectedList = values.stream().map(Value::tokenCount).filter(Objects::nonNull).toList();
+                var blockExpected = blockExpectedList.size() == 1 ? blockExpectedList.get(0) : blockExpectedList;
+
+                return new SyntheticSourceExample(textArray, textArray, blockExpected, this::mapping);
+            }
+
+            private record Value(String text, Integer tokenCount) {}
+
+            private Value generateValue() {
+                if (rarely()) {
+                    return new Value(null, null);
+                }
+
+                var text = randomList(0, 10, () -> randomAlphaOfLengthBetween(0, 10)).stream().collect(Collectors.joining(" "));
+                // with the keyword analyzer, the token count is always 1
+                return new Value(text, 1);
+            }
+
+            private void mapping(XContentBuilder b) throws IOException {
+                b.field("type", "token_count").field("analyzer", "keyword");
+                if (rarely()) {
+                    b.field("index", false);
+                }
+                if (rarely()) {
+                    b.field("store", true);
+                }
+                if (nullValue != null) {
+                    b.field("null_value", nullValue);
+                }
+            }
+
+            @Override
+            public List<SyntheticSourceInvalidExample> invalidExample() throws IOException {
+                return List.of();
+            }
+        };
+    }
+
+    protected Function<Object, Object> loadBlockExpected() {
+        // we can get either a number from doc values or null
+        return v -> v != null ? (Number) v : null;
     }
 
     @Override

+ 65 - 0
modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/token_count/10_basic.yml

@@ -0,0 +1,65 @@
+"Test token count":
+  - requires:
+      cluster_features: ["gte_v7.10.0"]
+      reason: "support for token_count was introduced in 7.10"
+  - do:
+      indices.create:
+        index:  test
+        body:
+          mappings:
+            properties:
+              count:
+                type: token_count
+                analyzer: standard
+              count_without_dv:
+                type: token_count
+                analyzer: standard
+                doc_values: false
+
+  - do:
+      index:
+        index:  test
+        id:     "1"
+        refresh: true
+        body:
+          count: "some text"
+  - do:
+      search:
+        index: test
+        body:
+          fields: [count, count_without_dv]
+
+  - is_true: hits.hits.0._id
+  - match: { hits.hits.0.fields.count: [2] }
+  - is_false: hits.hits.0.fields.count_without_dv
+
+---
+"Synthetic source":
+  - requires:
+      cluster_features: ["mapper.track_ignored_source"]
+      reason: requires tracking ignored source
+  - do:
+      indices.create:
+        index:  test
+        body:
+          mappings:
+            _source:
+              mode: synthetic
+            properties:
+              count:
+                type: token_count
+                analyzer: standard
+
+  - do:
+      index:
+        index:  test
+        id:     "1"
+        refresh: true
+        body:
+          count: "quick brown fox jumps over a lazy dog"
+  - do:
+      get:
+        index: test
+        id: "1"
+
+  - match: { _source.count: "quick brown fox jumps over a lazy dog" }

+ 0 - 35
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/330_fetch_fields.yml

@@ -262,41 +262,6 @@
   - match: { hits.hits.0.fields.date.0: "1990/12/29" }
 
 ---
-"Test token count":
-  - requires:
-      cluster_features: ["gte_v7.10.0"]
-      reason: "support for token_count was instroduced in 7.10"
-  - do:
-      indices.create:
-        index:  test
-        body:
-          mappings:
-            properties:
-              count:
-                type: token_count
-                analyzer: standard
-              count_without_dv:
-                type: token_count
-                analyzer: standard
-                doc_values: false
-
-  - do:
-      index:
-        index:  test
-        id:     "1"
-        refresh: true
-        body:
-          count: "some text"
-  - do:
-      search:
-        index: test
-        body:
-          fields: [count, count_without_dv]
-
-  - is_true: hits.hits.0._id
-  - match: { hits.hits.0.fields.count: [2] }
-  - is_false: hits.hits.0.fields.count_without_dv
----
 Test unmapped field:
   -  requires:
         cluster_features: "gte_v7.11.0"