Browse Source

[ML] Add a regex to the output of the categorize_text aggregation (#90723)

The new `regex` field in `categorize_text` output is created in
the same way as the `regex` field that appears in the category
definitions created by anomaly detection jobs that do categorization.

It consists of the terms that occur in the same order in every
message that matches the category, separated by `.+?` wildcards and
wrapped in a leading and trailing `.*?`. It therefore matches the
category's messages while enforcing the order of the terms that
occurred in the same order for all messages used to create the
category.

It is not recommended to use the regex as the primary mechanism for
searching for the original documents that were categorized, because
searching with a regular expression is very slow. Instead, the terms
of the category should be used to search for matching documents, as a
terms search can use the inverted index and hence be much faster.
However, there may be situations where it is useful to use the
`regex` field to test whether a small set of messages that have not
been indexed match the category.
David Roberts 3 years ago
parent
commit
bfccd20155

+ 5 - 0
docs/changelog/90723.yaml

@@ -0,0 +1,5 @@
+pr: 90723
+summary: Add a regex to the output of the `categorize_text` aggregation
+area: Machine Learning
+type: enhancement
+issues: []

+ 15 - 0
docs/reference/aggregations/bucket/categorize-text-aggregation.asciidoc

@@ -142,21 +142,25 @@ Response:
         {
           "doc_count" : 3,
           "key" : "Node shutting down",
+          "regex" : ".*?Node.+?shutting.+?down.*?",
           "max_matching_length" : 49
         },
         {
           "doc_count" : 1,
           "key" : "Node starting up",
+          "regex" : ".*?Node.+?starting.+?up.*?",
           "max_matching_length" : 47
         },
         {
           "doc_count" : 1,
           "key" : "User foo_325 logging on",
+          "regex" : ".*?User.+?foo_325.+?logging.+?on.*?",
           "max_matching_length" : 52
         },
         {
           "doc_count" : 1,
           "key" : "User foo_864 logged off",
+          "regex" : ".*?User.+?foo_864.+?logged.+?off.*?",
           "max_matching_length" : 52
         }
       ]
@@ -198,21 +202,25 @@ category results
         {
           "doc_count" : 3,
           "key" : "Node shutting down",
+          "regex" : ".*?Node.+?shutting.+?down.*?",
           "max_matching_length" : 49
         },
         {
           "doc_count" : 1,
           "key" : "Node starting up",
+          "regex" : ".*?Node.+?starting.+?up.*?",
           "max_matching_length" : 47
         },
         {
           "doc_count" : 1,
           "key" : "User logged off",
+          "regex" : ".*?User.+?logged.+?off.*?",
           "max_matching_length" : 52
         },
         {
           "doc_count" : 1,
           "key" : "User logging on",
+          "regex" : ".*?User.+?logging.+?on.*?",
           "max_matching_length" : 52
         }
       ]
@@ -266,11 +274,13 @@ The resulting categories are now very broad, merging the log groups.
         {
           "doc_count" : 4,
           "key" : "Node",
+          "regex" : ".*?Node.*?",
           "max_matching_length" : 49
         },
         {
           "doc_count" : 2,
           "key" : "User",
+          "regex" : ".*?User.*?",
           "max_matching_length" : 52
         }
       ]
@@ -330,6 +340,7 @@ POST log-messages/_search?filter_path=aggregations
               {
                 "doc_count" : 2,
                 "key" : "Node shutting down",
+                "regex" : ".*?Node.+?shutting.+?down.*?",
                 "max_matching_length" : 49,
                 "hit" : {
                   "hits" : {
@@ -357,6 +368,7 @@ POST log-messages/_search?filter_path=aggregations
               {
                 "doc_count" : 1,
                 "key" : "Node starting up",
+                "regex" : ".*?Node.+?starting.+?up.*?",
                 "max_matching_length" : 47,
                 "hit" : {
                   "hits" : {
@@ -393,6 +405,7 @@ POST log-messages/_search?filter_path=aggregations
               {
                 "doc_count" : 1,
                 "key" : "Node shutting down",
+                "regex" : ".*?Node.+?shutting.+?down.*?",
                 "max_matching_length" : 49,
                 "hit" : {
                   "hits" : {
@@ -420,6 +433,7 @@ POST log-messages/_search?filter_path=aggregations
               {
                 "doc_count" : 1,
                 "key" : "User logged off",
+                "regex" : ".*?User.+?logged.+?off.*?",
                 "max_matching_length" : 52,
                 "hit" : {
                   "hits" : {
@@ -447,6 +461,7 @@ POST log-messages/_search?filter_path=aggregations
               {
                 "doc_count" : 1,
                 "key" : "User logging on",
+                "regex" : ".*?User.+?logging.+?on.*?",
                 "max_matching_length" : 52,
                 "hit" : {
                   "hits" : {

+ 1 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/InternalCategorizationAggregation.java

@@ -145,6 +145,7 @@ public class InternalCategorizationAggregation extends InternalMultiBucketAggreg
             builder.field(CommonFields.DOC_COUNT.getPreferredName(), serializableCategory.getNumMatches());
             builder.field(CommonFields.KEY.getPreferredName());
             key.toXContent(builder, params);
+            builder.field(CategoryDefinition.REGEX.getPreferredName(), serializableCategory.getRegex());
             builder.field(CategoryDefinition.MAX_MATCHING_LENGTH.getPreferredName(), serializableCategory.maxMatchingStringLen());
             aggregations.toXContentInternal(builder, params);
             builder.endObject();

+ 12 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/SerializableTokenListCategory.java

@@ -37,6 +37,8 @@ public class SerializableTokenListCategory implements Writeable {
      */
     public static final int KEY_BUDGET = 10000;
 
+    private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
+
     final BytesRef[] baseTokens;
     final int[] baseTokenWeights;
     final int baseUnfilteredLength;
@@ -160,6 +162,16 @@ public class SerializableTokenListCategory implements Writeable {
         return Arrays.stream(keyTokenIndexes).mapToObj(index -> baseTokens[index]).toArray(BytesRef[]::new);
     }
 
+    public String getRegex() {
+        if (keyTokenIndexes.length == 0 || orderedCommonTokenBeginIndex == orderedCommonTokenEndIndex) {
+            return ".*";
+        }
+        return Arrays.stream(keyTokenIndexes)
+            .filter(index -> index >= orderedCommonTokenBeginIndex && index < orderedCommonTokenEndIndex)
+            .mapToObj(index -> baseTokens[index].utf8ToString().replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1"))
+            .collect(Collectors.joining(".+?", ".*?", ".*?"));
+    }
+
     @Override
     public void writeTo(StreamOutput out) throws IOException {
         out.writeArray(StreamOutput::writeBytesRef, baseTokens);

+ 44 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizeTextAggregatorTests.java

@@ -64,11 +64,18 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
             assertThat(result.getBuckets(), hasSize(2));
             assertThat(result.getBuckets().get(0).getDocCount(), equalTo(6L));
             assertThat(result.getBuckets().get(0).getKeyAsString(), equalTo("Node started"));
+            assertThat(result.getBuckets().get(0).getSerializableCategory().maxMatchingStringLen(), equalTo(15));
+            assertThat(result.getBuckets().get(0).getSerializableCategory().getRegex(), equalTo(".*?Node.+?started.*?"));
             assertThat(result.getBuckets().get(1).getDocCount(), equalTo(2L));
             assertThat(
                 result.getBuckets().get(1).getKeyAsString(),
                 equalTo("Failed to shutdown error org.aaaa.bbbb.Cccc line caused by foo exception")
             );
+            assertThat(result.getBuckets().get(1).getSerializableCategory().maxMatchingStringLen(), equalTo(84));
+            assertThat(
+                result.getBuckets().get(1).getSerializableCategory().getRegex(),
+                equalTo(".*?Failed.+?to.+?shutdown.+?error.+?org\\.aaaa\\.bbbb\\.Cccc.+?line.+?caused.+?by.+?foo.+?exception.*?")
+            );
         },
             new AggTestConfig(
                 new CategorizeTextAggregationBuilder("my_agg", TEXT_FIELD_NAME),
@@ -88,6 +95,8 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
             assertThat(result.getBuckets(), hasSize(2));
             assertThat(result.getBuckets().get(0).getDocCount(), equalTo(6L));
             assertThat(result.getBuckets().get(0).getKeyAsString(), equalTo("Node started"));
+            assertThat(result.getBuckets().get(0).getSerializableCategory().maxMatchingStringLen(), equalTo(15));
+            assertThat(result.getBuckets().get(0).getSerializableCategory().getRegex(), equalTo(".*?Node.+?started.*?"));
             assertThat(((Max) result.getBuckets().get(0).getAggregations().get("max")).value(), equalTo(5.0));
             assertThat(((Min) result.getBuckets().get(0).getAggregations().get("min")).value(), equalTo(0.0));
             assertThat(((Avg) result.getBuckets().get(0).getAggregations().get("avg")).getValue(), equalTo(2.5));
@@ -97,6 +106,11 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
                 result.getBuckets().get(1).getKeyAsString(),
                 equalTo("Failed to shutdown error org.aaaa.bbbb.Cccc line caused by foo exception")
             );
+            assertThat(result.getBuckets().get(1).getSerializableCategory().maxMatchingStringLen(), equalTo(84));
+            assertThat(
+                result.getBuckets().get(1).getSerializableCategory().getRegex(),
+                equalTo(".*?Failed.+?to.+?shutdown.+?error.+?org\\.aaaa\\.bbbb\\.Cccc.+?line.+?caused.+?by.+?foo.+?exception.*?")
+            );
             assertThat(((Max) result.getBuckets().get(1).getAggregations().get("max")).value(), equalTo(4.0));
             assertThat(((Min) result.getBuckets().get(1).getAggregations().get("min")).value(), equalTo(0.0));
             assertThat(((Avg) result.getBuckets().get(1).getAggregations().get("avg")).getValue(), equalTo(2.0));
@@ -115,6 +129,8 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
             assertThat(result.getBuckets(), hasSize(2));
             assertThat(result.getBuckets().get(0).getDocCount(), equalTo(6L));
             assertThat(result.getBuckets().get(0).getKeyAsString(), equalTo("Node started"));
+            assertThat(result.getBuckets().get(0).getSerializableCategory().maxMatchingStringLen(), equalTo(15));
+            assertThat(result.getBuckets().get(0).getSerializableCategory().getRegex(), equalTo(".*?Node.+?started.*?"));
             Histogram histo = result.getBuckets().get(0).getAggregations().get("histo");
             assertThat(histo.getBuckets(), hasSize(3));
             for (Histogram.Bucket bucket : histo.getBuckets()) {
@@ -135,6 +151,11 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
                 result.getBuckets().get(1).getKeyAsString(),
                 equalTo("Failed to shutdown error org.aaaa.bbbb.Cccc line caused by foo exception")
             );
+            assertThat(result.getBuckets().get(1).getSerializableCategory().maxMatchingStringLen(), equalTo(84));
+            assertThat(
+                result.getBuckets().get(1).getSerializableCategory().getRegex(),
+                equalTo(".*?Failed.+?to.+?shutdown.+?error.+?org\\.aaaa\\.bbbb\\.Cccc.+?line.+?caused.+?by.+?foo.+?exception.*?")
+            );
             histo = result.getBuckets().get(1).getAggregations().get("histo");
             assertThat(histo.getBuckets(), hasSize(3));
             assertThat(histo.getBuckets().get(0).getDocCount(), equalTo(1L));
@@ -167,6 +188,8 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
             assertThat(categorizationAggregation.getBuckets(), hasSize(2));
             assertThat(categorizationAggregation.getBuckets().get(0).getDocCount(), equalTo(2L));
             assertThat(categorizationAggregation.getBuckets().get(0).getKeyAsString(), equalTo("Node started"));
+            assertThat(categorizationAggregation.getBuckets().get(0).getSerializableCategory().maxMatchingStringLen(), equalTo(15));
+            assertThat(categorizationAggregation.getBuckets().get(0).getSerializableCategory().getRegex(), equalTo(".*?Node.+?started.*?"));
             assertThat(((Max) categorizationAggregation.getBuckets().get(0).getAggregations().get("max")).value(), equalTo(1.0));
             assertThat(((Min) categorizationAggregation.getBuckets().get(0).getAggregations().get("min")).value(), equalTo(0.0));
             assertThat(((Avg) categorizationAggregation.getBuckets().get(0).getAggregations().get("avg")).getValue(), equalTo(0.5));
@@ -176,6 +199,11 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
                 categorizationAggregation.getBuckets().get(1).getKeyAsString(),
                 equalTo("Failed to shutdown error org.aaaa.bbbb.Cccc line caused by foo exception")
             );
+            assertThat(categorizationAggregation.getBuckets().get(1).getSerializableCategory().maxMatchingStringLen(), equalTo(84));
+            assertThat(
+                categorizationAggregation.getBuckets().get(1).getSerializableCategory().getRegex(),
+                equalTo(".*?Failed.+?to.+?shutdown.+?error.+?org\\.aaaa\\.bbbb\\.Cccc.+?line.+?caused.+?by.+?foo.+?exception.*?")
+            );
             assertThat(((Max) categorizationAggregation.getBuckets().get(1).getAggregations().get("max")).value(), equalTo(0.0));
             assertThat(((Min) categorizationAggregation.getBuckets().get(1).getAggregations().get("min")).value(), equalTo(0.0));
             assertThat(((Avg) categorizationAggregation.getBuckets().get(1).getAggregations().get("avg")).getValue(), equalTo(0.0));
@@ -186,6 +214,8 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
             assertThat(categorizationAggregation.getBuckets(), hasSize(1));
             assertThat(categorizationAggregation.getBuckets().get(0).getDocCount(), equalTo(2L));
             assertThat(categorizationAggregation.getBuckets().get(0).getKeyAsString(), equalTo("Node started"));
+            assertThat(categorizationAggregation.getBuckets().get(0).getSerializableCategory().maxMatchingStringLen(), equalTo(15));
+            assertThat(categorizationAggregation.getBuckets().get(0).getSerializableCategory().getRegex(), equalTo(".*?Node.+?started.*?"));
             assertThat(((Max) categorizationAggregation.getBuckets().get(0).getAggregations().get("max")).value(), equalTo(3.0));
             assertThat(((Min) categorizationAggregation.getBuckets().get(0).getAggregations().get("min")).value(), equalTo(2.0));
             assertThat(((Avg) categorizationAggregation.getBuckets().get(0).getAggregations().get("avg")).getValue(), equalTo(2.5));
@@ -196,6 +226,8 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
             assertThat(categorizationAggregation.getBuckets(), hasSize(2));
             assertThat(categorizationAggregation.getBuckets().get(0).getDocCount(), equalTo(2L));
             assertThat(categorizationAggregation.getBuckets().get(0).getKeyAsString(), equalTo("Node started"));
+            assertThat(categorizationAggregation.getBuckets().get(0).getSerializableCategory().maxMatchingStringLen(), equalTo(15));
+            assertThat(categorizationAggregation.getBuckets().get(0).getSerializableCategory().getRegex(), equalTo(".*?Node.+?started.*?"));
             assertThat(((Max) categorizationAggregation.getBuckets().get(0).getAggregations().get("max")).value(), equalTo(5.0));
             assertThat(((Min) categorizationAggregation.getBuckets().get(0).getAggregations().get("min")).value(), equalTo(4.0));
             assertThat(((Avg) categorizationAggregation.getBuckets().get(0).getAggregations().get("avg")).getValue(), equalTo(4.5));
@@ -205,6 +237,11 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
                 categorizationAggregation.getBuckets().get(1).getKeyAsString(),
                 equalTo("Failed to shutdown error org.aaaa.bbbb.Cccc line caused by foo exception")
             );
+            assertThat(categorizationAggregation.getBuckets().get(1).getSerializableCategory().maxMatchingStringLen(), equalTo(84));
+            assertThat(
+                categorizationAggregation.getBuckets().get(1).getSerializableCategory().getRegex(),
+                equalTo(".*?Failed.+?to.+?shutdown.+?error.+?org\\.aaaa\\.bbbb\\.Cccc.+?line.+?caused.+?by.+?foo.+?exception.*?")
+            );
             assertThat(((Max) categorizationAggregation.getBuckets().get(1).getAggregations().get("max")).value(), equalTo(4.0));
             assertThat(((Min) categorizationAggregation.getBuckets().get(1).getAggregations().get("min")).value(), equalTo(4.0));
             assertThat(((Avg) categorizationAggregation.getBuckets().get(1).getAggregations().get("avg")).getValue(), equalTo(4.0));
@@ -223,6 +260,8 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
             assertThat(result.getBuckets(), hasSize(2));
             assertThat(result.getBuckets().get(0).getDocCount(), equalTo(30000L));
             assertThat(result.getBuckets().get(0).getKeyAsString(), equalTo("Node started"));
+            assertThat(result.getBuckets().get(0).getSerializableCategory().maxMatchingStringLen(), equalTo(15));
+            assertThat(result.getBuckets().get(0).getSerializableCategory().getRegex(), equalTo(".*?Node.+?started.*?"));
             Histogram histo = result.getBuckets().get(0).getAggregations().get("histo");
             assertThat(histo.getBuckets(), hasSize(3));
             for (Histogram.Bucket bucket : histo.getBuckets()) {
@@ -243,6 +282,11 @@ public class CategorizeTextAggregatorTests extends AggregatorTestCase {
                 result.getBuckets().get(1).getKeyAsString(),
                 equalTo("Failed to shutdown error org.aaaa.bbbb.Cccc line caused by foo exception")
             );
+            assertThat(result.getBuckets().get(1).getSerializableCategory().maxMatchingStringLen(), equalTo(84));
+            assertThat(
+                result.getBuckets().get(1).getSerializableCategory().getRegex(),
+                equalTo(".*?Failed.+?to.+?shutdown.+?error.+?org\\.aaaa\\.bbbb\\.Cccc.+?line.+?caused.+?by.+?foo.+?exception.*?")
+            );
             histo = result.getBuckets().get(1).getAggregations().get("histo");
             assertThat(histo.getBuckets(), hasSize(3));
             assertThat(histo.getBuckets().get(0).getDocCount(), equalTo(5000L));

+ 8 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/categorization/ParsedCategorization.java

@@ -57,6 +57,7 @@ class ParsedCategorization extends ParsedMultiBucketAggregation<ParsedCategoriza
     public static class ParsedBucket extends ParsedMultiBucketAggregation.ParsedBucket implements MultiBucketsAggregation.Bucket {
 
         private InternalCategorizationAggregation.BucketKey key;
+        private String regex;
         private int maxMatchingLength;
 
         protected void setKeyAsString(String keyAsString) {
@@ -76,6 +77,10 @@ class ParsedCategorization extends ParsedMultiBucketAggregation<ParsedCategoriza
             );
         }
 
+        private void setRegex(String regex) {
+            this.regex = regex;
+        }
+
         private void setMaxMatchingLength(int maxMatchingLength) {
             this.maxMatchingLength = maxMatchingLength;
         }
@@ -99,6 +104,7 @@ class ParsedCategorization extends ParsedMultiBucketAggregation<ParsedCategoriza
         public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
             builder.startObject();
             keyToXContent(builder);
+            builder.field(CategoryDefinition.REGEX.getPreferredName(), regex);
             builder.field(CategoryDefinition.MAX_MATCHING_LENGTH.getPreferredName(), maxMatchingLength);
             builder.field(CommonFields.DOC_COUNT.getPreferredName(), getDocCount());
             getAggregations().toXContentInternal(builder, params);
@@ -142,6 +148,8 @@ class ParsedCategorization extends ParsedMultiBucketAggregation<ParsedCategoriza
                         keyConsumer.accept(parser, bucket);
                     } else if (CommonFields.DOC_COUNT.getPreferredName().equals(currentFieldName)) {
                         bucket.setDocCount(parser.longValue());
+                    } else if (CategoryDefinition.REGEX.getPreferredName().equals(currentFieldName)) {
+                        bucket.setRegex(parser.text());
                     } else if (CategoryDefinition.MAX_MATCHING_LENGTH.getPreferredName().equals(currentFieldName)) {
                         bucket.setMaxMatchingLength(parser.intValue());
                     }

+ 47 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/categorization/SerializableTokenListCategoryTests.java

@@ -7,6 +7,7 @@
 
 package org.elasticsearch.xpack.ml.aggs.categorization;
 
+import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.io.stream.Writeable;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.common.util.BytesRefHash;
@@ -14,6 +15,13 @@ import org.elasticsearch.test.AbstractWireSerializingTestCase;
 import org.junit.After;
 import org.junit.Before;
 
+import java.util.ArrayList;
+import java.util.List;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+import static org.hamcrest.Matchers.equalTo;
+
 public class SerializableTokenListCategoryTests extends AbstractWireSerializingTestCase<SerializableTokenListCategory> {
 
     private CategorizationBytesRefHash bytesRefHash;
@@ -28,6 +36,45 @@ public class SerializableTokenListCategoryTests extends AbstractWireSerializingT
         bytesRefHash.close();
     }
 
+    public void testGetRegex() {
+        StringBuilder expectedResult = new StringBuilder();
+
+        int unfilteredStringLength = 0;
+        int numBaseTokens = randomIntBetween(3, 15);
+        List<TokenListCategory.TokenAndWeight> baseWeightedTokenIds = new ArrayList<>();
+        for (int i = 0; i < numBaseTokens; ++i) {
+            int stringLen = randomIntBetween(3, 20);
+            unfilteredStringLength += stringLen;
+            String string = randomAlphaOfLength(stringLen);
+            BytesRef token = new BytesRef(string);
+            baseWeightedTokenIds.add(new TokenListCategory.TokenAndWeight(bytesRefHash.put(token), randomIntBetween(1, 5)));
+            expectedResult.append(i == 0 ? ".*?" : ".+?").append(string);
+        }
+        expectedResult.append(".*?");
+        unfilteredStringLength += randomIntBetween(numBaseTokens, numBaseTokens + 100);
+        List<TokenListCategory.TokenAndWeight> uniqueWeightedTokenIds = baseWeightedTokenIds.stream()
+            .collect(
+                Collectors.groupingBy(
+                    TokenListCategory.TokenAndWeight::getTokenId,
+                    TreeMap::new,
+                    Collectors.summingInt(TokenListCategory.TokenAndWeight::getWeight)
+                )
+            )
+            .entrySet()
+            .stream()
+            .map(entry -> new TokenListCategory.TokenAndWeight(entry.getKey(), entry.getValue()))
+            .toList();
+        TokenListCategory category = new TokenListCategory(
+            1,
+            unfilteredStringLength,
+            baseWeightedTokenIds,
+            uniqueWeightedTokenIds,
+            randomLongBetween(1, 10)
+        );
+
+        assertThat(new SerializableTokenListCategory(category, bytesRefHash).getRegex(), equalTo(expectedResult.toString()));
+    }
+
     @Override
     protected Writeable.Reader<SerializableTokenListCategory> instanceReader() {
         return SerializableTokenListCategory::new;

+ 5 - 0
x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/categorization_agg.yml

@@ -80,8 +80,13 @@ setup:
   - length: { aggregations.categories.buckets: 2 }
   - match: { aggregations.categories.buckets.0.doc_count: 4 }
   - match: { aggregations.categories.buckets.0.key: "Node" }
+  - match: { aggregations.categories.buckets.0.regex: ".*?Node.*?" }
+  - match: { aggregations.categories.buckets.0.max_matching_length: 16 }
   - match: { aggregations.categories.buckets.1.doc_count: 3 }
   - match: { aggregations.categories.buckets.1.key: "User Foo logging" }
+  - match: { aggregations.categories.buckets.1.regex: ".*?User.+?Foo.+?logging.*?" }
+  - match: { aggregations.categories.buckets.1.max_matching_length: 22 }
+
 ---
 "Test categorization aggregation against unsupported field":
   - do: