Browse Source

ES|QL categorize options (#131104)

* ES|QL categorize options

* refactor options

* fix serialization

* polish

* add verifications

* better test coverage + polish code

* better test coverage + polish code
Jan Kuipers 2 months ago
parent
commit
ec7f77becb
31 changed files with 572 additions and 215 deletions
  1. 1 1
      benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java
  2. 13 0
      docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md
  3. 3 0
      docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md
  4. 3 0
      docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md
  5. 4 4
      docs/reference/query-languages/esql/_snippets/functions/types/categorize.md
  6. 1 1
      docs/reference/query-languages/esql/images/functions/categorize.svg
  7. 1 0
      server/src/main/java/org/elasticsearch/TransportVersions.java
  8. 23 7
      x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java
  9. 28 11
      x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java
  10. 9 1
      x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java
  11. 74 36
      x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java
  12. 14 5
      x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java
  13. 1 1
      x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java
  14. 2 2
      x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java
  15. 80 1
      x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec
  16. 9 2
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
  17. 107 0
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java
  18. 0 71
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/FullTextFunction.java
  19. 5 7
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/Match.java
  20. 3 7
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MatchPhrase.java
  21. 2 1
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MultiMatch.java
  22. 3 7
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/QueryString.java
  23. 111 8
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java
  24. 4 34
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java
  25. 4 3
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java
  26. 6 2
      x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java
  27. 51 0
      x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java
  28. 1 1
      x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java
  29. 1 1
      x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java
  30. 1 1
      x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java
  31. 7 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/SerializableTokenListCategory.java

+ 1 - 1
benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java

@@ -191,7 +191,7 @@ public class AggregatorBenchmark {
                 new BlockHash.GroupSpec(2, ElementType.BYTES_REF)
             );
             case TOP_N_LONGS -> List.of(
-                new BlockHash.GroupSpec(0, ElementType.LONG, false, new BlockHash.TopNDef(0, true, true, TOP_N_LIMIT))
+                new BlockHash.GroupSpec(0, ElementType.LONG, null, new BlockHash.TopNDef(0, true, true, TOP_N_LIMIT))
             );
             default -> throw new IllegalArgumentException("unsupported grouping [" + grouping + "]");
         };

+ 13 - 0
docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md

@@ -0,0 +1,13 @@
+% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it.
+
+**Supported function named parameters**
+
+`output_format`
+:   (keyword) The output format of the categories. Defaults to regex.
+
+`similarity_threshold`
+:   (integer) The minimum percentage of token weight that must match for text to be added to the category bucket. Must be between 1 and 100. The larger the value the narrower the categories. Larger values will increase memory usage and create narrower categories. Defaults to 70.
+
+`analyzer`
+:   (keyword) Analyzer used to convert the field into tokens for text categorization.
+

+ 3 - 0
docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md

@@ -19,5 +19,8 @@
 :::{include} ../types/categorize.md
 :::
 
+:::{include} ../functionNamedParams/categorize.md
+:::
+
 :::{include} ../examples/categorize.md
 :::

+ 3 - 0
docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md

@@ -5,3 +5,6 @@
 `field`
 :   Expression to categorize
 
+`options`
+:   (Optional) Categorize additional options as [function named parameters](/reference/query-languages/esql/esql-syntax.md#esql-function-named-params).
+

+ 4 - 4
docs/reference/query-languages/esql/_snippets/functions/types/categorize.md

@@ -2,8 +2,8 @@
 
 **Supported types**
 
-| field | result |
-| --- | --- |
-| keyword | keyword |
-| text | keyword |
+| field | options | result |
+| --- | --- | --- |
+| keyword | | keyword |
+| text | | keyword |
 

+ 1 - 1
docs/reference/query-languages/esql/images/functions/categorize.svg

@@ -1 +1 @@
-<svg version="1.1" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg" width="324" height="46" viewbox="0 0 324 46"><defs><style type="text/css">.c{fill:none;stroke:#222222;}.k{fill:#000000;font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;font-size:20px;}.s{fill:#e4f4ff;stroke:#222222;}.syn{fill:#8D8D8D;font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;font-size:20px;}</style></defs><path class="c" d="M0 31h5m140 0h10m32 0h10m80 0h10m32 0h5"/><rect class="s" x="5" y="5" width="140" height="36"/><text class="k" x="15" y="31">CATEGORIZE</text><rect class="s" x="155" y="5" width="32" height="36" rx="7"/><text class="syn" x="165" y="31">(</text><rect class="s" x="197" y="5" width="80" height="36" rx="7"/><text class="k" x="207" y="31">field</text><rect class="s" x="287" y="5" width="32" height="36" rx="7"/><text class="syn" x="297" y="31">)</text></svg>
+<svg version="1.1" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg" width="520" height="61" viewbox="0 0 520 61"><defs><style type="text/css">.c{fill:none;stroke:#222222;}.k{fill:#000000;font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;font-size:20px;}.s{fill:#e4f4ff;stroke:#222222;}.syn{fill:#8D8D8D;font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;font-size:20px;}</style></defs><path class="c" d="M0 31h5m140 0h10m32 0h10m80 0h10m32 0h30m104 0h20m-139 0q5 0 5 5v10q0 5 5 5h114q5 0 5-5v-10q0-5 5-5m5 0h10m32 0h5"/><rect class="s" x="5" y="5" width="140" height="36"/><text class="k" x="15" y="31">CATEGORIZE</text><rect class="s" x="155" y="5" width="32" height="36" rx="7"/><text class="syn" x="165" y="31">(</text><rect class="s" x="197" y="5" width="80" height="36" rx="7"/><text class="k" x="207" y="31">field</text><rect class="s" x="287" y="5" width="32" height="36" rx="7"/><text class="syn" x="297" y="31">,</text><rect class="s" x="349" y="5" width="104" height="36" rx="7"/><text class="k" x="359" y="31">options</text><rect class="s" x="483" y="5" width="32" height="36" rx="7"/><text class="syn" x="493" y="31">)</text></svg>

+ 1 - 0
server/src/main/java/org/elasticsearch/TransportVersions.java

@@ -340,6 +340,7 @@ public class TransportVersions {
     public static final TransportVersion ESQL_FIXED_INDEX_LIKE = def(9_119_0_00);
     public static final TransportVersion LOOKUP_JOIN_CCS = def(9_120_0_00);
     public static final TransportVersion NODE_USAGE_STATS_FOR_THREAD_POOLS_IN_CLUSTER_INFO = def(9_121_0_00);
+    public static final TransportVersion ESQL_CATEGORIZE_OPTIONS = def(9_122_0_00);
 
     /*
      * STOP! READ THIS FIRST! No, really,

+ 23 - 7
x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java

@@ -128,16 +128,26 @@ public abstract class BlockHash implements Releasable, SeenGroupIds {
     public record TopNDef(int order, boolean asc, boolean nullsFirst, int limit) {}
 
     /**
-     * @param isCategorize Whether this group is a CATEGORIZE() or not.
-     *                     May be changed in the future when more stateful grouping functions are added.
+     * Configuration for a BlockHash group spec that is doing text categorization.
      */
-    public record GroupSpec(int channel, ElementType elementType, boolean isCategorize, @Nullable TopNDef topNDef) {
+    public record CategorizeDef(String analyzer, OutputFormat outputFormat, int similarityThreshold) {
+        public enum OutputFormat {
+            REGEX,
+            TOKENS
+        }
+    }
+
+    public record GroupSpec(int channel, ElementType elementType, @Nullable CategorizeDef categorizeDef, @Nullable TopNDef topNDef) {
         public GroupSpec(int channel, ElementType elementType) {
-            this(channel, elementType, false, null);
+            this(channel, elementType, null, null);
+        }
+
+        public GroupSpec(int channel, ElementType elementType, CategorizeDef categorizeDef) {
+            this(channel, elementType, categorizeDef, null);
         }
 
-        public GroupSpec(int channel, ElementType elementType, boolean isCategorize) {
-            this(channel, elementType, isCategorize, null);
+        public boolean isCategorize() {
+            return categorizeDef != null;
         }
     }
 
@@ -207,7 +217,13 @@ public abstract class BlockHash implements Releasable, SeenGroupIds {
         int emitBatchSize
     ) {
         if (groups.size() == 1) {
-            return new CategorizeBlockHash(blockFactory, groups.get(0).channel, aggregatorMode, analysisRegistry);
+            return new CategorizeBlockHash(
+                blockFactory,
+                groups.get(0).channel,
+                aggregatorMode,
+                groups.get(0).categorizeDef,
+                analysisRegistry
+            );
         } else {
             assert groups.get(0).isCategorize();
             assert groups.subList(1, groups.size()).stream().noneMatch(GroupSpec::isCategorize);

+ 28 - 11
x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java

@@ -18,7 +18,6 @@ import org.elasticsearch.common.util.BitArray;
 import org.elasticsearch.common.util.BytesRefHash;
 import org.elasticsearch.compute.aggregation.AggregatorMode;
 import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction;
-import org.elasticsearch.compute.aggregation.SeenGroupIds;
 import org.elasticsearch.compute.data.Block;
 import org.elasticsearch.compute.data.BlockFactory;
 import org.elasticsearch.compute.data.BytesRefBlock;
@@ -47,12 +46,13 @@ import java.util.Objects;
  */
 public class CategorizeBlockHash extends BlockHash {
 
-    private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig
+    private static final CategorizationAnalyzerConfig DEFAULT_ANALYZER_CONFIG = CategorizationAnalyzerConfig
         .buildStandardEsqlCategorizationAnalyzer();
     private static final int NULL_ORD = 0;
 
     private final int channel;
     private final AggregatorMode aggregatorMode;
+    private final CategorizeDef categorizeDef;
     private final TokenListCategorizer.CloseableTokenListCategorizer categorizer;
     private final CategorizeEvaluator evaluator;
 
@@ -64,28 +64,38 @@ public class CategorizeBlockHash extends BlockHash {
      */
     private boolean seenNull = false;
 
-    CategorizeBlockHash(BlockFactory blockFactory, int channel, AggregatorMode aggregatorMode, AnalysisRegistry analysisRegistry) {
+    CategorizeBlockHash(
+        BlockFactory blockFactory,
+        int channel,
+        AggregatorMode aggregatorMode,
+        CategorizeDef categorizeDef,
+        AnalysisRegistry analysisRegistry
+    ) {
         super(blockFactory);
 
         this.channel = channel;
         this.aggregatorMode = aggregatorMode;
+        this.categorizeDef = categorizeDef;
 
         this.categorizer = new TokenListCategorizer.CloseableTokenListCategorizer(
             new CategorizationBytesRefHash(new BytesRefHash(2048, blockFactory.bigArrays())),
             CategorizationPartOfSpeechDictionary.getInstance(),
-            0.70f
+            categorizeDef.similarityThreshold() / 100.0f
         );
 
         if (aggregatorMode.isInputPartial() == false) {
-            CategorizationAnalyzer analyzer;
+            CategorizationAnalyzer categorizationAnalyzer;
             try {
                 Objects.requireNonNull(analysisRegistry);
-                analyzer = new CategorizationAnalyzer(analysisRegistry, ANALYZER_CONFIG);
-            } catch (Exception e) {
+                CategorizationAnalyzerConfig config = categorizeDef.analyzer() == null
+                    ? DEFAULT_ANALYZER_CONFIG
+                    : new CategorizationAnalyzerConfig.Builder().setAnalyzer(categorizeDef.analyzer()).build();
+                categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, config);
+            } catch (IOException e) {
                 categorizer.close();
                 throw new RuntimeException(e);
             }
-            this.evaluator = new CategorizeEvaluator(analyzer);
+            this.evaluator = new CategorizeEvaluator(categorizationAnalyzer);
         } else {
             this.evaluator = null;
         }
@@ -114,7 +124,7 @@ public class CategorizeBlockHash extends BlockHash {
 
     @Override
     public BitArray seenGroupIds(BigArrays bigArrays) {
-        return new SeenGroupIds.Range(seenNull ? 0 : 1, Math.toIntExact(categorizer.getCategoryCount() + 1)).seenGroupIds(bigArrays);
+        return new Range(seenNull ? 0 : 1, Math.toIntExact(categorizer.getCategoryCount() + 1)).seenGroupIds(bigArrays);
     }
 
     @Override
@@ -222,7 +232,7 @@ public class CategorizeBlockHash extends BlockHash {
             try (BytesRefBlock.Builder result = blockFactory.newBytesRefBlockBuilder(categorizer.getCategoryCount())) {
                 result.appendNull();
                 for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
-                    scratch.copyChars(category.getRegex());
+                    scratch.copyChars(getKeyString(category));
                     result.appendBytesRef(scratch.get());
                     scratch.clear();
                 }
@@ -232,7 +242,7 @@ public class CategorizeBlockHash extends BlockHash {
 
         try (BytesRefVector.Builder result = blockFactory.newBytesRefVectorBuilder(categorizer.getCategoryCount())) {
             for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
-                scratch.copyChars(category.getRegex());
+                scratch.copyChars(getKeyString(category));
                 result.appendBytesRef(scratch.get());
                 scratch.clear();
             }
@@ -240,6 +250,13 @@ public class CategorizeBlockHash extends BlockHash {
         }
     }
 
+    private String getKeyString(SerializableTokenListCategory category) {
+        return switch (categorizeDef.outputFormat()) {
+            case REGEX -> category.getRegex();
+            case TOKENS -> category.getKeyTokensString();
+        };
+    }
+
     /**
      * Similar implementation to an Evaluator.
      */

+ 9 - 1
x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java

@@ -56,6 +56,8 @@ public class CategorizePackedValuesBlockHash extends BlockHash {
         int emitBatchSize
     ) {
         super(blockFactory);
+        assert specs.get(0).categorizeDef() != null;
+
         this.specs = specs;
         this.aggregatorMode = aggregatorMode;
         blocks = new Block[specs.size()];
@@ -68,7 +70,13 @@ public class CategorizePackedValuesBlockHash extends BlockHash {
 
         boolean success = false;
         try {
-            categorizeBlockHash = new CategorizeBlockHash(blockFactory, specs.get(0).channel(), aggregatorMode, analysisRegistry);
+            categorizeBlockHash = new CategorizeBlockHash(
+                blockFactory,
+                specs.get(0).channel(),
+                aggregatorMode,
+                specs.get(0).categorizeDef(),
+                analysisRegistry
+            );
             packedValuesBlockHash = new PackedValuesBlockHash(delegateSpecs, blockFactory, emitBatchSize);
             success = true;
         } finally {

+ 74 - 36
x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java

@@ -76,7 +76,13 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
         ).getAnalysisRegistry();
     }
 
+    private BlockHash.CategorizeDef getCategorizeDef() {
+        return new BlockHash.CategorizeDef(null, randomFrom(BlockHash.CategorizeDef.OutputFormat.values()), 70);
+    }
+
     public void testCategorizeRaw() {
+        BlockHash.CategorizeDef categorizeDef = getCategorizeDef();
+
         final Page page;
         boolean withNull = randomBoolean();
         final int positions = 7 + (withNull ? 1 : 0);
@@ -98,7 +104,7 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
             page = new Page(builder.build());
         }
 
-        try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, analysisRegistry)) {
+        try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, categorizeDef, analysisRegistry)) {
             for (int i = randomInt(2); i < 3; i++) {
                 hash.add(page, new GroupingAggregatorFunction.AddInput() {
                     private void addBlock(int positionOffset, IntBlock groupIds) {
@@ -137,7 +143,10 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
                     }
                 });
 
-                assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?");
+                switch (categorizeDef.outputFormat()) {
+                    case REGEX -> assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?");
+                    case TOKENS -> assertHashState(hash, withNull, "Connected to", "Connection error", "Disconnected");
+                }
             }
         } finally {
             page.releaseBlocks();
@@ -145,6 +154,8 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
     }
 
     public void testCategorizeRawMultivalue() {
+        BlockHash.CategorizeDef categorizeDef = getCategorizeDef();
+
         final Page page;
         boolean withNull = randomBoolean();
         final int positions = 3 + (withNull ? 1 : 0);
@@ -170,7 +181,7 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
             page = new Page(builder.build());
         }
 
-        try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, analysisRegistry)) {
+        try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, categorizeDef, analysisRegistry)) {
             for (int i = randomInt(2); i < 3; i++) {
                 hash.add(page, new GroupingAggregatorFunction.AddInput() {
                     private void addBlock(int positionOffset, IntBlock groupIds) {
@@ -216,7 +227,10 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
                     }
                 });
 
-                assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?");
+                switch (categorizeDef.outputFormat()) {
+                    case REGEX -> assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?");
+                    case TOKENS -> assertHashState(hash, withNull, "Connected to", "Connection error", "Disconnected");
+                }
             }
         } finally {
             page.releaseBlocks();
@@ -224,6 +238,8 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
     }
 
     public void testCategorizeIntermediate() {
+        BlockHash.CategorizeDef categorizeDef = getCategorizeDef();
+
         Page page1;
         boolean withNull = randomBoolean();
         int positions1 = 7 + (withNull ? 1 : 0);
@@ -259,8 +275,8 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
 
         // Fill intermediatePages with the intermediate state from the raw hashes
         try (
-            BlockHash rawHash1 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, analysisRegistry);
-            BlockHash rawHash2 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, analysisRegistry);
+            BlockHash rawHash1 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, categorizeDef, analysisRegistry);
+            BlockHash rawHash2 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, categorizeDef, analysisRegistry);
         ) {
             rawHash1.add(page1, new GroupingAggregatorFunction.AddInput() {
                 private void addBlock(int positionOffset, IntBlock groupIds) {
@@ -335,7 +351,7 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
             page2.releaseBlocks();
         }
 
-        try (var intermediateHash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.FINAL, null)) {
+        try (var intermediateHash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.FINAL, categorizeDef, null)) {
             intermediateHash.add(intermediatePage1, new GroupingAggregatorFunction.AddInput() {
                 private void addBlock(int positionOffset, IntBlock groupIds) {
                     List<Integer> values = IntStream.range(0, groupIds.getPositionCount())
@@ -403,14 +419,24 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
                     }
                 });
 
-                assertHashState(
-                    intermediateHash,
-                    withNull,
-                    ".*?Connected.+?to.*?",
-                    ".*?Connection.+?error.*?",
-                    ".*?Disconnected.*?",
-                    ".*?System.+?shutdown.*?"
-                );
+                switch (categorizeDef.outputFormat()) {
+                    case REGEX -> assertHashState(
+                        intermediateHash,
+                        withNull,
+                        ".*?Connected.+?to.*?",
+                        ".*?Connection.+?error.*?",
+                        ".*?Disconnected.*?",
+                        ".*?System.+?shutdown.*?"
+                    );
+                    case TOKENS -> assertHashState(
+                        intermediateHash,
+                        withNull,
+                        "Connected to",
+                        "Connection error",
+                        "Disconnected",
+                        "System shutdown"
+                    );
+                }
             }
         } finally {
             intermediatePage1.releaseBlocks();
@@ -419,6 +445,9 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
     }
 
     public void testCategorize_withDriver() {
+        BlockHash.CategorizeDef categorizeDef = getCategorizeDef();
+        BlockHash.GroupSpec groupSpec = new BlockHash.GroupSpec(0, ElementType.BYTES_REF, categorizeDef);
+
         BigArrays bigArrays = new MockBigArrays(PageCacheRecycler.NON_RECYCLING_INSTANCE, ByteSizeValue.ofMb(256)).withCircuitBreaking();
         CircuitBreaker breaker = bigArrays.breakerService().getBreaker(CircuitBreaker.REQUEST);
         DriverContext driverContext = new DriverContext(bigArrays, new BlockFactory(breaker, bigArrays));
@@ -477,7 +506,7 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
             new LocalSourceOperator(input1),
             List.of(
                 new HashAggregationOperator.HashAggregationOperatorFactory(
-                    List.of(makeGroupSpec()),
+                    List.of(groupSpec),
                     AggregatorMode.INITIAL,
                     List.of(
                         new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(AggregatorMode.INITIAL, List.of(1)),
@@ -496,7 +525,7 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
             new LocalSourceOperator(input2),
             List.of(
                 new HashAggregationOperator.HashAggregationOperatorFactory(
-                    List.of(makeGroupSpec()),
+                    List.of(groupSpec),
                     AggregatorMode.INITIAL,
                     List.of(
                         new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(AggregatorMode.INITIAL, List.of(1)),
@@ -517,7 +546,7 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
             new CannedSourceOperator(intermediateOutput.iterator()),
             List.of(
                 new HashAggregationOperator.HashAggregationOperatorFactory(
-                    List.of(makeGroupSpec()),
+                    List.of(groupSpec),
                     AggregatorMode.FINAL,
                     List.of(
                         new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(AggregatorMode.FINAL, List.of(1, 2)),
@@ -544,23 +573,36 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
             sums.put(outputTexts.getBytesRef(i, new BytesRef()).utf8ToString(), outputSums.getLong(i));
             maxs.put(outputTexts.getBytesRef(i, new BytesRef()).utf8ToString(), outputMaxs.getLong(i));
         }
+        List<String> keys = switch (categorizeDef.outputFormat()) {
+            case REGEX -> List.of(
+                ".*?aaazz.*?",
+                ".*?bbbzz.*?",
+                ".*?ccczz.*?",
+                ".*?dddzz.*?",
+                ".*?eeezz.*?",
+                ".*?words.+?words.+?words.+?goodbye.*?",
+                ".*?words.+?words.+?words.+?hello.*?"
+            );
+            case TOKENS -> List.of("aaazz", "bbbzz", "ccczz", "dddzz", "eeezz", "words words words goodbye", "words words words hello");
+        };
+
         assertThat(
             sums,
             equalTo(
                 Map.of(
-                    ".*?aaazz.*?",
+                    keys.get(0),
                     1L,
-                    ".*?bbbzz.*?",
+                    keys.get(1),
                     2L,
-                    ".*?ccczz.*?",
+                    keys.get(2),
                     33L,
-                    ".*?dddzz.*?",
+                    keys.get(3),
                     44L,
-                    ".*?eeezz.*?",
+                    keys.get(4),
                     5L,
-                    ".*?words.+?words.+?words.+?goodbye.*?",
+                    keys.get(5),
                     8888L,
-                    ".*?words.+?words.+?words.+?hello.*?",
+                    keys.get(6),
                     999L
                 )
             )
@@ -569,19 +611,19 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
             maxs,
             equalTo(
                 Map.of(
-                    ".*?aaazz.*?",
+                    keys.get(0),
                     1L,
-                    ".*?bbbzz.*?",
+                    keys.get(1),
                     2L,
-                    ".*?ccczz.*?",
+                    keys.get(2),
                     30L,
-                    ".*?dddzz.*?",
+                    keys.get(3),
                     40L,
-                    ".*?eeezz.*?",
+                    keys.get(4),
                     5L,
-                    ".*?words.+?words.+?words.+?goodbye.*?",
+                    keys.get(5),
                     8000L,
-                    ".*?words.+?words.+?words.+?hello.*?",
+                    keys.get(6),
                     900L
                 )
             )
@@ -589,10 +631,6 @@ public class CategorizeBlockHashTests extends BlockHashTestCase {
         Releasables.close(() -> Iterators.map(finalOutput.iterator(), (Page p) -> p::releaseBlocks));
     }
 
-    private BlockHash.GroupSpec makeGroupSpec() {
-        return new BlockHash.GroupSpec(0, ElementType.BYTES_REF, true);
-    }
-
     private void assertHashState(CategorizeBlockHash hash, boolean withNull, String... expectedKeys) {
         // Check the keys
         Block[] blocks = null;

+ 14 - 5
x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java

@@ -74,10 +74,15 @@ public class CategorizePackedValuesBlockHashTests extends BlockHashTestCase {
         DriverContext driverContext = new DriverContext(bigArrays, new BlockFactory(breaker, bigArrays));
         boolean withNull = randomBoolean();
         boolean withMultivalues = randomBoolean();
+        BlockHash.CategorizeDef categorizeDef = new BlockHash.CategorizeDef(
+            null,
+            randomFrom(BlockHash.CategorizeDef.OutputFormat.values()),
+            70
+        );
 
         List<BlockHash.GroupSpec> groupSpecs = List.of(
-            new BlockHash.GroupSpec(0, ElementType.BYTES_REF, true),
-            new BlockHash.GroupSpec(1, ElementType.INT, false)
+            new BlockHash.GroupSpec(0, ElementType.BYTES_REF, categorizeDef),
+            new BlockHash.GroupSpec(1, ElementType.INT, null)
         );
 
         LocalSourceOperator.BlockSupplier input1 = () -> {
@@ -218,8 +223,12 @@ public class CategorizePackedValuesBlockHashTests extends BlockHashTestCase {
         }
         Releasables.close(() -> Iterators.map(finalOutput.iterator(), (Page p) -> p::releaseBlocks));
 
+        List<String> keys = switch (categorizeDef.outputFormat()) {
+            case REGEX -> List.of(".*?connected.+?to.*?", ".*?connection.+?error.*?", ".*?disconnected.*?");
+            case TOKENS -> List.of("connected to", "connection error", "disconnected");
+        };
         Map<String, Map<Integer, Set<String>>> expectedResult = Map.of(
-            ".*?connected.+?to.*?",
+            keys.get(0),
             Map.of(
                 7,
                 Set.of("connected to 1.1.1", "connected to 1.1.2", "connected to 1.1.4", "connected to 2.1.2"),
@@ -228,9 +237,9 @@ public class CategorizePackedValuesBlockHashTests extends BlockHashTestCase {
                 111,
                 Set.of("connected to 2.1.1")
             ),
-            ".*?connection.+?error.*?",
+            keys.get(1),
             Map.of(7, Set.of("connection error"), 42, Set.of("connection error")),
-            ".*?disconnected.*?",
+            keys.get(2),
             Map.of(7, Set.of("disconnected"))
         );
         if (withNull) {

+ 1 - 1
x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java

@@ -363,7 +363,7 @@ public class TopNBlockHashTests extends BlockHashTestCase {
     private BlockHash buildBlockHash(int emitBatchSize, Block... values) {
         List<BlockHash.GroupSpec> specs = new ArrayList<>(values.length);
         for (int c = 0; c < values.length; c++) {
-            specs.add(new BlockHash.GroupSpec(c, values[c].elementType(), false, topNDef(c)));
+            specs.add(new BlockHash.GroupSpec(c, values[c].elementType(), null, topNDef(c)));
         }
         assert forcePackedHash == false : "Packed TopN hash not implemented yet";
         /*return forcePackedHash

+ 2 - 2
x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java

@@ -113,7 +113,7 @@ public class HashAggregationOperatorTests extends ForkingOperatorTestCase {
 
         try (
             var operator = new HashAggregationOperator.HashAggregationOperatorFactory(
-                List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, false, new BlockHash.TopNDef(0, ascOrder, false, 3))),
+                List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, null, new BlockHash.TopNDef(0, ascOrder, false, 3))),
                 mode,
                 List.of(
                     new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(mode, aggregatorChannels),
@@ -190,7 +190,7 @@ public class HashAggregationOperatorTests extends ForkingOperatorTestCase {
 
         try (
             var operator = new HashAggregationOperator.HashAggregationOperatorFactory(
-                List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, false, new BlockHash.TopNDef(0, ascOrder, true, 3))),
+                List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, null, new BlockHash.TopNDef(0, ascOrder, true, 3))),
                 mode,
                 List.of(
                     new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(mode, aggregatorChannels),

+ 80 - 1
x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec

@@ -397,7 +397,7 @@ FROM sample_data
 ;
 
 COUNT():long | SUM(event_duration):long | category:keyword
-           7 |                 23231327 |  null
+           7 |                 23231327 | null
 ;
 
 on null row
@@ -800,3 +800,82 @@ COUNT():long | VALUES(str):keyword | category:keyword | str:keyword
            1 | [a, b, c]           | null             | b
            1 | [a, b, c]           | null             | c
 ;
+
+with option output_format regex
+required_capability: categorize_options
+
+FROM sample_data
+  | STATS count=COUNT()
+       BY category=CATEGORIZE(message, {"output_format": "regex"})
+  | SORT count DESC, category
+;
+
+count:long | category:keyword
+         3 | .*?Connected.+?to.*?
+         3 | .*?Connection.+?error.*?
+         1 | .*?Disconnected.*?
+;
+
+with option output_format tokens
+required_capability: categorize_options
+
+FROM sample_data
+  | STATS count=COUNT()
+       BY category=CATEGORIZE(message, {"output_format": "tokens"})
+  | SORT count DESC, category
+;
+
+count:long | category:keyword
+         3 | Connected to
+         3 | Connection error
+         1 | Disconnected
+;
+
+with option similarity_threshold
+required_capability: categorize_options
+
+FROM sample_data
+  | STATS count=COUNT()
+       BY category=CATEGORIZE(message, {"similarity_threshold": 99})
+  | SORT count DESC, category
+;
+
+count:long | category:keyword
+3          | .*?Connection.+?error.*?
+1          | .*?Connected.+?to.+?10\.1\.0\.1.*?
+1          | .*?Connected.+?to.+?10\.1\.0\.2.*?
+1          | .*?Connected.+?to.+?10\.1\.0\.3.*?
+1          | .*?Disconnected.*?
+;
+
+with option analyzer
+required_capability: categorize_options
+
+FROM sample_data
+  | STATS count=COUNT()
+       BY category=CATEGORIZE(message, {"analyzer": "stop"})
+  | SORT count DESC, category
+;
+
+count:long | category:keyword
+3          | .*?connected.*?
+3          | .*?connection.+?error.*?
+1          | .*?disconnected.*?
+;
+
+with all options
+required_capability: categorize_options
+
+FROM sample_data
+  | STATS count=COUNT()
+       BY category=CATEGORIZE(message, {"analyzer": "whitespace", "similarity_threshold": 100, "output_format": "tokens"})
+  | SORT count DESC, category
+;
+
+count:long | category:keyword
+3          | Connection error
+1          | Connected to 10.1.0.1
+1          | Connected to 10.1.0.2
+1          | Connected to 10.1.0.3
+1          | Disconnected
+;

+ 9 - 2
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

@@ -1254,10 +1254,12 @@ public class EsqlCapabilities {
          * FUSE command
          */
         FUSE(Build.current().isSnapshot()),
+
         /**
          * Support improved behavior for LIKE operator when used with index fields.
          */
         LIKE_ON_INDEX_FIELDS,
+
         /**
          * Support avg with aggregate metric doubles
          */
@@ -1274,10 +1276,15 @@ public class EsqlCapabilities {
          */
         FAIL_IF_ALL_SHARDS_FAIL(Build.current().isSnapshot()),
 
-        /*
+        /**
          * Cosine vector similarity function
          */
-        COSINE_VECTOR_SIMILARITY_FUNCTION(Build.current().isSnapshot());
+        COSINE_VECTOR_SIMILARITY_FUNCTION(Build.current().isSnapshot()),
+
+        /**
+         * Support for the options field of CATEGORIZE.
+         */
+        CATEGORIZE_OPTIONS;
 
         private final boolean enabled;
 

+ 107 - 0
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java

@@ -0,0 +1,107 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.esql.expression.function;
+
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.xpack.esql.core.InvalidArgumentException;
+import org.elasticsearch.xpack.esql.core.expression.EntryExpression;
+import org.elasticsearch.xpack.esql.core.expression.Expression;
+import org.elasticsearch.xpack.esql.core.expression.Literal;
+import org.elasticsearch.xpack.esql.core.expression.MapExpression;
+import org.elasticsearch.xpack.esql.core.expression.TypeResolutions;
+import org.elasticsearch.xpack.esql.core.tree.Source;
+import org.elasticsearch.xpack.esql.core.type.DataType;
+import org.elasticsearch.xpack.esql.core.type.DataTypeConverter;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Consumer;
+
+import static org.elasticsearch.common.logging.LoggerMessageFormat.format;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isFoldable;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isMapExpression;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull;
+
+public class Options {
+
+    public static Expression.TypeResolution resolve(
+        Expression options,
+        Source source,
+        TypeResolutions.ParamOrdinal paramOrdinal,
+        Map<String, DataType> allowedOptions
+    ) {
+        return resolve(options, source, paramOrdinal, allowedOptions, null);
+    }
+
+    public static Expression.TypeResolution resolve(
+        Expression options,
+        Source source,
+        TypeResolutions.ParamOrdinal paramOrdinal,
+        Map<String, DataType> allowedOptions,
+        Consumer<Map<String, Object>> verifyOptions
+    ) {
+        if (options != null) {
+            Expression.TypeResolution resolution = isNotNull(options, source.text(), paramOrdinal);
+            if (resolution.unresolved()) {
+                return resolution;
+            }
+            // MapExpression does not have a DataType associated with it
+            resolution = isMapExpression(options, source.text(), paramOrdinal);
+            if (resolution.unresolved()) {
+                return resolution;
+            }
+            try {
+                Map<String, Object> optionsMap = new HashMap<>();
+                populateMap((MapExpression) options, optionsMap, source, paramOrdinal, allowedOptions);
+                if (verifyOptions != null) {
+                    verifyOptions.accept(optionsMap);
+                }
+            } catch (InvalidArgumentException e) {
+                return new Expression.TypeResolution(e.getMessage());
+            }
+        }
+        return Expression.TypeResolution.TYPE_RESOLVED;
+    }
+
+    public static void populateMap(
+        final MapExpression options,
+        final Map<String, Object> optionsMap,
+        final Source source,
+        final TypeResolutions.ParamOrdinal paramOrdinal,
+        final Map<String, DataType> allowedOptions
+    ) throws InvalidArgumentException {
+        for (EntryExpression entry : options.entryExpressions()) {
+            Expression optionExpr = entry.key();
+            Expression valueExpr = entry.value();
+            Expression.TypeResolution resolution = isFoldable(optionExpr, source.text(), paramOrdinal).and(
+                isFoldable(valueExpr, source.text(), paramOrdinal)
+            );
+            if (resolution.unresolved()) {
+                throw new InvalidArgumentException(resolution.message());
+            }
+            Object optionExprLiteral = ((Literal) optionExpr).value();
+            Object valueExprLiteral = ((Literal) valueExpr).value();
+            String optionName = optionExprLiteral instanceof BytesRef br ? br.utf8ToString() : optionExprLiteral.toString();
+            String optionValue = valueExprLiteral instanceof BytesRef br ? br.utf8ToString() : valueExprLiteral.toString();
+            // validate the optionExpr is supported
+            DataType dataType = allowedOptions.get(optionName);
+            if (dataType == null) {
+                throw new InvalidArgumentException(
+                    format(null, "Invalid option [{}] in [{}], expected one of {}", optionName, source.text(), allowedOptions.keySet())
+                );
+            }
+            try {
+                optionsMap.put(optionName, DataTypeConverter.convert(optionValue, dataType));
+            } catch (InvalidArgumentException e) {
+                throw new InvalidArgumentException(
+                    format(null, "Invalid option [{}] in [{}], {}", optionName, source.text(), e.getMessage())
+                );
+            }
+        }
+    }
+}

+ 0 - 71
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/FullTextFunction.java

@@ -7,7 +7,6 @@
 
 package org.elasticsearch.xpack.esql.expression.function.fulltext;
 
-import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.compute.lucene.LuceneQueryEvaluator.ShardConfig;
 import org.elasticsearch.compute.lucene.LuceneQueryExpressionEvaluator;
@@ -20,20 +19,15 @@ import org.elasticsearch.xpack.esql.action.EsqlCapabilities;
 import org.elasticsearch.xpack.esql.capabilities.PostAnalysisPlanVerificationAware;
 import org.elasticsearch.xpack.esql.capabilities.TranslationAware;
 import org.elasticsearch.xpack.esql.common.Failures;
-import org.elasticsearch.xpack.esql.core.InvalidArgumentException;
-import org.elasticsearch.xpack.esql.core.expression.EntryExpression;
 import org.elasticsearch.xpack.esql.core.expression.Expression;
 import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
 import org.elasticsearch.xpack.esql.core.expression.FoldContext;
-import org.elasticsearch.xpack.esql.core.expression.Literal;
-import org.elasticsearch.xpack.esql.core.expression.MapExpression;
 import org.elasticsearch.xpack.esql.core.expression.Nullability;
 import org.elasticsearch.xpack.esql.core.expression.TypeResolutions;
 import org.elasticsearch.xpack.esql.core.expression.function.Function;
 import org.elasticsearch.xpack.esql.core.querydsl.query.Query;
 import org.elasticsearch.xpack.esql.core.tree.Source;
 import org.elasticsearch.xpack.esql.core.type.DataType;
-import org.elasticsearch.xpack.esql.core.type.DataTypeConverter;
 import org.elasticsearch.xpack.esql.core.type.MultiTypeEsField;
 import org.elasticsearch.xpack.esql.evaluator.mapper.EvaluatorMapper;
 import org.elasticsearch.xpack.esql.expression.function.scalar.convert.AbstractConvertFunction;
@@ -55,17 +49,12 @@ import org.elasticsearch.xpack.esql.score.ExpressionScoreMapper;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
-import java.util.Map;
 import java.util.Objects;
 import java.util.function.BiConsumer;
 import java.util.function.Predicate;
 
-import static org.elasticsearch.common.logging.LoggerMessageFormat.format;
 import static org.elasticsearch.xpack.esql.common.Failure.fail;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.DEFAULT;
-import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isFoldable;
-import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isMapExpression;
-import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNullAndFoldable;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString;
 
@@ -409,66 +398,6 @@ public abstract class FullTextFunction extends Function
         return new LuceneQueryScoreEvaluator.Factory(shardConfigs);
     }
 
-    protected static void populateOptionsMap(
-        final MapExpression options,
-        final Map<String, Object> optionsMap,
-        final TypeResolutions.ParamOrdinal paramOrdinal,
-        final String sourceText,
-        final Map<String, DataType> allowedOptions
-    ) throws InvalidArgumentException {
-        for (EntryExpression entry : options.entryExpressions()) {
-            Expression optionExpr = entry.key();
-            Expression valueExpr = entry.value();
-            TypeResolution resolution = isFoldable(optionExpr, sourceText, paramOrdinal).and(
-                isFoldable(valueExpr, sourceText, paramOrdinal)
-            );
-            if (resolution.unresolved()) {
-                throw new InvalidArgumentException(resolution.message());
-            }
-            Object optionExprLiteral = ((Literal) optionExpr).value();
-            Object valueExprLiteral = ((Literal) valueExpr).value();
-            String optionName = optionExprLiteral instanceof BytesRef br ? br.utf8ToString() : optionExprLiteral.toString();
-            String optionValue = valueExprLiteral instanceof BytesRef br ? br.utf8ToString() : valueExprLiteral.toString();
-            // validate the optionExpr is supported
-            DataType dataType = allowedOptions.get(optionName);
-            if (dataType == null) {
-                throw new InvalidArgumentException(
-                    format(null, "Invalid option [{}] in [{}], expected one of {}", optionName, sourceText, allowedOptions.keySet())
-                );
-            }
-            try {
-                optionsMap.put(optionName, DataTypeConverter.convert(optionValue, dataType));
-            } catch (InvalidArgumentException e) {
-                throw new InvalidArgumentException(format(null, "Invalid option [{}] in [{}], {}", optionName, sourceText, e.getMessage()));
-            }
-        }
-    }
-
-    protected TypeResolution resolveOptions(Expression options, TypeResolutions.ParamOrdinal paramOrdinal) {
-        if (options != null) {
-            TypeResolution resolution = isNotNull(options, sourceText(), paramOrdinal);
-            if (resolution.unresolved()) {
-                return resolution;
-            }
-            // MapExpression does not have a DataType associated with it
-            resolution = isMapExpression(options, sourceText(), paramOrdinal);
-            if (resolution.unresolved()) {
-                return resolution;
-            }
-
-            try {
-                resolvedOptions();
-            } catch (InvalidArgumentException e) {
-                return new TypeResolution(e.getMessage());
-            }
-        }
-        return TypeResolution.TYPE_RESOLVED;
-    }
-
-    protected Map<String, Object> resolvedOptions() throws InvalidArgumentException {
-        return Map.of();
-    }
-
     // TODO: this should likely be replaced by calls to FieldAttribute#fieldName; the MultiTypeEsField case looks
     // wrong if `fieldAttribute` is a subfield, e.g. `parent.child` - multiTypeEsField#getName will just return `child`.
     public static String getNameFromFieldAttribute(FieldAttribute fieldAttribute) {

+ 5 - 7
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/Match.java

@@ -33,6 +33,7 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesToLifecyc
 import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
 import org.elasticsearch.xpack.esql.expression.function.MapParam;
 import org.elasticsearch.xpack.esql.expression.function.OptionalArgument;
+import org.elasticsearch.xpack.esql.expression.function.Options;
 import org.elasticsearch.xpack.esql.expression.function.Param;
 import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput;
 import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates;
@@ -298,7 +299,9 @@ public class Match extends FullTextFunction implements OptionalArgument, PostAna
 
     @Override
     protected TypeResolution resolveParams() {
-        return resolveField().and(resolveQuery()).and(resolveOptions(options(), THIRD)).and(checkParamCompatibility());
+        return resolveField().and(resolveQuery())
+            .and(Options.resolve(options(), source(), THIRD, ALLOWED_OPTIONS))
+            .and(checkParamCompatibility());
     }
 
     private TypeResolution resolveField() {
@@ -342,11 +345,6 @@ public class Match extends FullTextFunction implements OptionalArgument, PostAna
         return new TypeResolution(formatIncompatibleTypesMessage(fieldType, queryType, sourceText()));
     }
 
-    @Override
-    protected Map<String, Object> resolvedOptions() {
-        return matchQueryOptions();
-    }
-
     private Map<String, Object> matchQueryOptions() throws InvalidArgumentException {
         if (options() == null) {
             return Map.of(LENIENT_FIELD.getPreferredName(), true);
@@ -356,7 +354,7 @@ public class Match extends FullTextFunction implements OptionalArgument, PostAna
         // Match is lenient by default to avoid failing on incompatible types
         matchOptions.put(LENIENT_FIELD.getPreferredName(), true);
 
-        populateOptionsMap((MapExpression) options(), matchOptions, SECOND, sourceText(), ALLOWED_OPTIONS);
+        Options.populateMap((MapExpression) options(), matchOptions, source(), SECOND, ALLOWED_OPTIONS);
         return matchOptions;
     }
 

+ 3 - 7
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MatchPhrase.java

@@ -30,6 +30,7 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesToLifecyc
 import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
 import org.elasticsearch.xpack.esql.expression.function.MapParam;
 import org.elasticsearch.xpack.esql.expression.function.OptionalArgument;
+import org.elasticsearch.xpack.esql.expression.function.Options;
 import org.elasticsearch.xpack.esql.expression.function.Param;
 import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput;
 import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates;
@@ -187,7 +188,7 @@ public class MatchPhrase extends FullTextFunction implements OptionalArgument, P
 
     @Override
     protected TypeResolution resolveParams() {
-        return resolveField().and(resolveQuery()).and(resolveOptions(options(), THIRD));
+        return resolveField().and(resolveQuery()).and(Options.resolve(options(), source(), THIRD, ALLOWED_OPTIONS));
     }
 
     private TypeResolution resolveField() {
@@ -200,18 +201,13 @@ public class MatchPhrase extends FullTextFunction implements OptionalArgument, P
         );
     }
 
-    @Override
-    protected Map<String, Object> resolvedOptions() throws InvalidArgumentException {
-        return matchPhraseQueryOptions();
-    }
-
     private Map<String, Object> matchPhraseQueryOptions() throws InvalidArgumentException {
         if (options() == null) {
             return Map.of();
         }
 
         Map<String, Object> matchPhraseOptions = new HashMap<>();
-        populateOptionsMap((MapExpression) options(), matchPhraseOptions, SECOND, sourceText(), ALLOWED_OPTIONS);
+        Options.populateMap((MapExpression) options(), matchPhraseOptions, source(), SECOND, ALLOWED_OPTIONS);
         return matchPhraseOptions;
     }
 

+ 2 - 1
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MultiMatch.java

@@ -29,6 +29,7 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesToLifecyc
 import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
 import org.elasticsearch.xpack.esql.expression.function.MapParam;
 import org.elasticsearch.xpack.esql.expression.function.OptionalArgument;
+import org.elasticsearch.xpack.esql.expression.function.Options;
 import org.elasticsearch.xpack.esql.expression.function.Param;
 import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput;
 import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates;
@@ -368,7 +369,7 @@ public class MultiMatch extends FullTextFunction implements OptionalArgument, Po
             return options;
         }
 
-        Match.populateOptionsMap((MapExpression) options(), options, THIRD, sourceText(), OPTIONS);
+        Options.populateMap((MapExpression) options(), options, source(), THIRD, OPTIONS);
         return options;
     }
 

+ 3 - 7
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/QueryString.java

@@ -26,6 +26,7 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesToLifecyc
 import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
 import org.elasticsearch.xpack.esql.expression.function.MapParam;
 import org.elasticsearch.xpack.esql.expression.function.OptionalArgument;
+import org.elasticsearch.xpack.esql.expression.function.Options;
 import org.elasticsearch.xpack.esql.expression.function.Param;
 import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput;
 import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates;
@@ -321,18 +322,13 @@ public class QueryString extends FullTextFunction implements OptionalArgument {
         }
 
         Map<String, Object> matchOptions = new HashMap<>();
-        populateOptionsMap((MapExpression) options(), matchOptions, SECOND, sourceText(), ALLOWED_OPTIONS);
+        Options.populateMap((MapExpression) options(), matchOptions, source(), SECOND, ALLOWED_OPTIONS);
         return matchOptions;
     }
 
-    @Override
-    protected Map<String, Object> resolvedOptions() {
-        return queryStringOptions();
-    }
-
     @Override
     protected TypeResolution resolveParams() {
-        return resolveQuery().and(resolveOptions(options(), SECOND));
+        return resolveQuery().and(Options.resolve(options(), source(), SECOND, ALLOWED_OPTIONS));
     }
 
     @Override

+ 111 - 8
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java

@@ -7,13 +7,18 @@
 
 package org.elasticsearch.xpack.esql.expression.function.grouping;
 
+import org.elasticsearch.TransportVersions;
 import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.compute.aggregation.blockhash.BlockHash.CategorizeDef;
+import org.elasticsearch.compute.aggregation.blockhash.BlockHash.CategorizeDef.OutputFormat;
 import org.elasticsearch.license.XPackLicenseState;
 import org.elasticsearch.xpack.esql.LicenseAware;
 import org.elasticsearch.xpack.esql.SupportsObservabilityTier;
+import org.elasticsearch.xpack.esql.core.InvalidArgumentException;
 import org.elasticsearch.xpack.esql.core.expression.Expression;
+import org.elasticsearch.xpack.esql.core.expression.MapExpression;
 import org.elasticsearch.xpack.esql.core.expression.Nullability;
 import org.elasticsearch.xpack.esql.core.tree.NodeInfo;
 import org.elasticsearch.xpack.esql.core.tree.Source;
@@ -21,16 +26,29 @@ import org.elasticsearch.xpack.esql.core.type.DataType;
 import org.elasticsearch.xpack.esql.expression.function.Example;
 import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
 import org.elasticsearch.xpack.esql.expression.function.FunctionType;
+import org.elasticsearch.xpack.esql.expression.function.MapParam;
+import org.elasticsearch.xpack.esql.expression.function.OptionalArgument;
+import org.elasticsearch.xpack.esql.expression.function.Options;
 import org.elasticsearch.xpack.esql.expression.function.Param;
 import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput;
 import org.elasticsearch.xpack.ml.MachineLearning;
 
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
 
+import static java.util.Map.entry;
+import static org.elasticsearch.common.logging.LoggerMessageFormat.format;
+import static org.elasticsearch.compute.aggregation.blockhash.BlockHash.CategorizeDef.OutputFormat.REGEX;
 import static org.elasticsearch.xpack.esql.SupportsObservabilityTier.ObservabilityTier.COMPLETE;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.DEFAULT;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString;
+import static org.elasticsearch.xpack.esql.core.type.DataType.INTEGER;
+import static org.elasticsearch.xpack.esql.core.type.DataType.KEYWORD;
 
 /**
  * Categorizes text messages.
@@ -42,14 +60,23 @@ import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isStr
  * </p>
  */
 @SupportsObservabilityTier(tier = COMPLETE)
-public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction implements LicenseAware {
+public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction implements OptionalArgument, LicenseAware {
     public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(
         Expression.class,
         "Categorize",
         Categorize::new
     );
 
+    private static final String ANALYZER = "analyzer";
+    private static final String OUTPUT_FORMAT = "output_format";
+    private static final String SIMILARITY_THRESHOLD = "similarity_threshold";
+
+    private static final Map<String, DataType> ALLOWED_OPTIONS = new TreeMap<>(
+        Map.ofEntries(entry(ANALYZER, KEYWORD), entry(OUTPUT_FORMAT, KEYWORD), entry(SIMILARITY_THRESHOLD, INTEGER))
+    );
+
     private final Expression field;
+    private final Expression options;
 
     @FunctionInfo(
         returnType = "keyword",
@@ -70,21 +97,56 @@ public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction
     )
     public Categorize(
         Source source,
-        @Param(name = "field", type = { "text", "keyword" }, description = "Expression to categorize") Expression field
-
+        @Param(name = "field", type = { "text", "keyword" }, description = "Expression to categorize") Expression field,
+        @MapParam(
+            name = "options",
+            description = "(Optional) Categorize additional options as <<esql-function-named-params,function named parameters>>.",
+            params = {
+                @MapParam.MapParamEntry(
+                    name = ANALYZER,
+                    type = "keyword",
+                    valueHint = { "standard" },
+                    description = "Analyzer used to convert the field into tokens for text categorization."
+                ),
+                @MapParam.MapParamEntry(
+                    name = OUTPUT_FORMAT,
+                    type = "keyword",
+                    valueHint = { "regex", "tokens" },
+                    description = "The output format of the categories. Defaults to regex."
+                ),
+                @MapParam.MapParamEntry(
+                    name = SIMILARITY_THRESHOLD,
+                    type = "integer",
+                    valueHint = { "70" },
+                    description = "The minimum percentage of token weight that must match for text to be added to the category bucket. "
+                        + "Must be between 1 and 100. The larger the value the narrower the categories. "
+                        + "Larger values will increase memory usage and create narrower categories. Defaults to 70."
+                ), },
+            optional = true
+        ) Expression options
     ) {
-        super(source, List.of(field));
+        super(source, options == null ? List.of(field) : List.of(field, options));
         this.field = field;
+        this.options = options;
     }
 
     private Categorize(StreamInput in) throws IOException {
-        this(Source.readFrom((PlanStreamInput) in), in.readNamedWriteable(Expression.class));
+        this(
+            Source.readFrom((PlanStreamInput) in),
+            in.readNamedWriteable(Expression.class),
+            in.getTransportVersion().onOrAfter(TransportVersions.ESQL_CATEGORIZE_OPTIONS)
+                ? in.readOptionalNamedWriteable(Expression.class)
+                : null
+        );
     }
 
     @Override
     public void writeTo(StreamOutput out) throws IOException {
         source().writeTo(out);
         out.writeNamedWriteable(field);
+        if (out.getTransportVersion().onOrAfter(TransportVersions.ESQL_CATEGORIZE_OPTIONS)) {
+            out.writeOptionalNamedWriteable(options);
+        }
     }
 
     @Override
@@ -107,7 +169,48 @@ public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction
 
     @Override
     protected TypeResolution resolveType() {
-        return isString(field(), sourceText(), DEFAULT);
+        return isString(field(), sourceText(), DEFAULT).and(
+            Options.resolve(options, source(), SECOND, ALLOWED_OPTIONS, this::verifyOptions)
+        );
+    }
+
+    private void verifyOptions(Map<String, Object> optionsMap) {
+        if (options == null) {
+            return;
+        }
+        Integer similarityThreshold = (Integer) optionsMap.get(SIMILARITY_THRESHOLD);
+        if (similarityThreshold != null) {
+            if (similarityThreshold <= 0 || similarityThreshold > 100) {
+                throw new InvalidArgumentException(
+                    format("invalid similarity threshold [{}], expecting a number between 1 and 100, inclusive", similarityThreshold)
+                );
+            }
+        }
+        String outputFormat = (String) optionsMap.get(OUTPUT_FORMAT);
+        if (outputFormat != null) {
+            try {
+                OutputFormat.valueOf(outputFormat.toUpperCase(Locale.ROOT));
+            } catch (IllegalArgumentException e) {
+                throw new InvalidArgumentException(
+                    format(null, "invalid output format [{}], expecting one of [REGEX, TOKENS]", outputFormat)
+                );
+            }
+        }
+    }
+
+    public CategorizeDef categorizeDef() {
+        Map<String, Object> optionsMap = new HashMap<>();
+        if (options != null) {
+            Options.populateMap((MapExpression) options, optionsMap, source(), SECOND, ALLOWED_OPTIONS);
+        }
+        Integer similarityThreshold = (Integer) optionsMap.get(SIMILARITY_THRESHOLD);
+        String outputFormatString = (String) optionsMap.get(OUTPUT_FORMAT);
+        OutputFormat outputFormat = outputFormatString == null ? null : OutputFormat.valueOf(outputFormatString.toUpperCase(Locale.ROOT));
+        return new CategorizeDef(
+            (String) optionsMap.get("analyzer"),
+            outputFormat == null ? REGEX : outputFormat,
+            similarityThreshold == null ? 70 : similarityThreshold
+        );
     }
 
     @Override
@@ -117,12 +220,12 @@ public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction
 
     @Override
     public Categorize replaceChildren(List<Expression> newChildren) {
-        return new Categorize(source(), newChildren.get(0));
+        // The options child is optional: present only when the user supplied a map expression.
+        return new Categorize(source(), newChildren.get(0), newChildren.size() > 1 ? newChildren.get(1) : null);
     }
 
     @Override
     protected NodeInfo<? extends Expression> info() {
-        return NodeInfo.create(this, Categorize::new, field, options);
+        // Options must be part of the node info so tree transforms/equality see them.
+        return NodeInfo.create(this, Categorize::new, field, options);
     }
 
     public Expression field() {

+ 4 - 34
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java

@@ -30,6 +30,7 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesToLifecyc
 import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
 import org.elasticsearch.xpack.esql.expression.function.MapParam;
 import org.elasticsearch.xpack.esql.expression.function.OptionalArgument;
+import org.elasticsearch.xpack.esql.expression.function.Options;
 import org.elasticsearch.xpack.esql.expression.function.Param;
 import org.elasticsearch.xpack.esql.expression.function.fulltext.FullTextFunction;
 import org.elasticsearch.xpack.esql.expression.function.fulltext.Match;
@@ -53,10 +54,10 @@ import static org.elasticsearch.search.vectors.KnnVectorQueryBuilder.K_FIELD;
 import static org.elasticsearch.search.vectors.KnnVectorQueryBuilder.NUM_CANDS_FIELD;
 import static org.elasticsearch.search.vectors.KnnVectorQueryBuilder.VECTOR_SIMILARITY_FIELD;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FOURTH;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.THIRD;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isFoldable;
-import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isMapExpression;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNullAndFoldable;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isType;
@@ -198,7 +199,7 @@ public class Knn extends FullTextFunction implements OptionalArgument, VectorFun
 
     @Override
     protected TypeResolution resolveParams() {
-        return resolveField().and(resolveQuery()).and(resolveK()).and(resolveOptions());
+        // Option validation is delegated to the shared Options helper (replaces the local resolveOptions()).
+        return resolveField().and(resolveQuery()).and(resolveK()).and(Options.resolve(options(), source(), FOURTH, ALLOWED_OPTIONS));
     }
 
     private TypeResolution resolveField() {
@@ -221,37 +222,6 @@ public class Knn extends FullTextFunction implements OptionalArgument, VectorFun
             .and(isNotNull(k(), sourceText(), THIRD));
     }
 
-    private TypeResolution resolveOptions() {
-        if (options() != null) {
-            TypeResolution resolution = isNotNull(options(), sourceText(), TypeResolutions.ParamOrdinal.FOURTH);
-            if (resolution.unresolved()) {
-                return resolution;
-            }
-            // MapExpression does not have a DataType associated with it
-            resolution = isMapExpression(options(), sourceText(), TypeResolutions.ParamOrdinal.FOURTH);
-            if (resolution.unresolved()) {
-                return resolution;
-            }
-
-            try {
-                knnQueryOptions();
-            } catch (InvalidArgumentException e) {
-                return new TypeResolution(e.getMessage());
-            }
-        }
-        return TypeResolution.TYPE_RESOLVED;
-    }
-
-    private Map<String, Object> knnQueryOptions() throws InvalidArgumentException {
-        if (options() == null) {
-            return Map.of();
-        }
-
-        Map<String, Object> matchOptions = new HashMap<>();
-        populateOptionsMap((MapExpression) options(), matchOptions, TypeResolutions.ParamOrdinal.FOURTH, sourceText(), ALLOWED_OPTIONS);
-        return matchOptions;
-    }
-
     @Override
     public Expression replaceQueryBuilder(QueryBuilder queryBuilder) {
         return new Knn(source(), field(), query(), k(), options(), queryBuilder, filterExpressions());
@@ -307,7 +277,7 @@ public class Knn extends FullTextFunction implements OptionalArgument, VectorFun
     private Map<String, Object> queryOptions() throws InvalidArgumentException {
         Map<String, Object> options = new HashMap<>();
         if (options() != null) {
-            populateOptionsMap((MapExpression) options(), options, TypeResolutions.ParamOrdinal.FOURTH, sourceText(), ALLOWED_OPTIONS);
+            // Shared helper validates keys against ALLOWED_OPTIONS while populating the map.
+            Options.populateMap((MapExpression) options(), options, source(), FOURTH, ALLOWED_OPTIONS);
         }
         return options;
     }

+ 4 - 3
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java

@@ -10,6 +10,7 @@ package org.elasticsearch.xpack.esql.optimizer.rules.logical;
 import org.elasticsearch.xpack.esql.core.expression.Alias;
 import org.elasticsearch.xpack.esql.core.expression.Attribute;
 import org.elasticsearch.xpack.esql.core.expression.Expression;
+import org.elasticsearch.xpack.esql.core.expression.MapExpression;
 import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
 import org.elasticsearch.xpack.esql.core.util.Holder;
 import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction;
@@ -137,13 +138,13 @@ public final class ReplaceAggregateNestedExpressionWithEval extends OptimizerRul
         List<Expression> newChildren = new ArrayList<>(gf.children().size());
 
         for (Expression ex : gf.children()) {
-            if (ex instanceof Attribute == false) { // TODO: foldables shouldn't require eval'ing either
+            if (ex instanceof Attribute || ex instanceof MapExpression) {
+                newChildren.add(ex);
+            } else { // TODO: foldables shouldn't require eval'ing either
                 var alias = new Alias(ex.source(), syntheticName(ex, gf, counter++), ex, null, true);
                 evals.add(alias);
                 newChildren.add(alias.toAttribute());
                 childrenChanged = true;
-            } else {
-                newChildren.add(ex);
             }
         }
 

+ 6 - 2
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java

@@ -343,8 +343,12 @@ public abstract class AbstractPhysicalOperationProviders implements PhysicalOper
             if (channel == null) {
                 throw new EsqlIllegalArgumentException("planned to use ordinals but tried to use the hash instead");
             }
-
-            return new BlockHash.GroupSpec(channel, elementType(), Alias.unwrap(expression) instanceof Categorize, null);
+            return new BlockHash.GroupSpec(
+                channel,
+                elementType(),
+                Alias.unwrap(expression) instanceof Categorize categorize ? categorize.categorizeDef() : null,
+                null
+            );
         }
 
         ElementType elementType() {

+ 51 - 0
x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java

@@ -1972,6 +1972,57 @@ public class VerifierTests extends ESTestCase {
         );
     }
 
+    // Non-map second argument and unknown option keys must both be rejected at verification time.
+    public void testCategorizeInvalidOptionsField() {
+        assumeTrue("categorize options must be enabled", EsqlCapabilities.Cap.CATEGORIZE_OPTIONS.isEnabled());
+
+        assertEquals(
+            "1:31: second argument of [CATEGORIZE(last_name, first_name)] must be a map expression, received [first_name]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, first_name)")
+        );
+        assertEquals(
+            "1:31: Invalid option [blah] in [CATEGORIZE(last_name, { \"blah\": 42 })], "
+                + "expected one of [analyzer, output_format, similarity_threshold]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"blah\": 42 })")
+        );
+    }
+
+    // output_format accepts REGEX/TOKENS in any casing; anything else (including non-strings) is rejected.
+    public void testCategorizeOptionOutputFormat() {
+        assumeTrue("categorize options must be enabled", EsqlCapabilities.Cap.CATEGORIZE_OPTIONS.isEnabled());
+
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"regex\" })");
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"REGEX\" })");
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"tokens\" })");
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"ToKeNs\" })");
+        assertEquals(
+            "1:31: invalid output format [blah], expecting one of [REGEX, TOKENS]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"blah\" })")
+        );
+        assertEquals(
+            "1:31: invalid output format [42], expecting one of [REGEX, TOKENS]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": 42 })")
+        );
+    }
+
+    // similarity_threshold is validated to the inclusive range [1, 100]; non-integer values fail the cast.
+    public void testCategorizeOptionSimilarityThreshold() {
+        assumeTrue("categorize options must be enabled", EsqlCapabilities.Cap.CATEGORIZE_OPTIONS.isEnabled());
+
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 1 })");
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 100 })");
+        assertEquals(
+            "1:31: invalid similarity threshold [0], expecting a number between 1 and 100, inclusive",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 0 })")
+        );
+        assertEquals(
+            "1:31: invalid similarity threshold [101], expecting a number between 1 and 100, inclusive",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 101 })")
+        );
+        assertEquals(
+            "1:31: Invalid option [similarity_threshold] in [CATEGORIZE(last_name, { \"similarity_threshold\": \"blah\" })], "
+                + "cannot cast [blah] to [integer]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": \"blah\" })")
+        );
+    }
+
     public void testChangePoint() {
         assumeTrue("change_point must be enabled", EsqlCapabilities.Cap.CHANGE_POINT.isEnabled());
         var airports = AnalyzerTestUtils.analyzer(loadMapping("mapping-airports.json", "airports"));

+ 1 - 1
x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java

@@ -27,7 +27,7 @@ public class CategorizeErrorTests extends ErrorsForCasesWithoutExamplesTestCase
 
     @Override
     protected Expression build(Source source, List<Expression> args) {
-        return new Categorize(source, args.get(0));
+        // Optional second argument carries the options map expression.
+        return new Categorize(source, args.get(0), args.size() > 1 ? args.get(1) : null);
     }
 
     @Override

+ 1 - 1
x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java

@@ -61,7 +61,7 @@ public class CategorizeTests extends AbstractScalarFunctionTestCase {
 
     @Override
     protected Expression build(Source source, List<Expression> args) {
-        return new Categorize(source, args.get(0));
+        // Optional second argument carries the options map expression.
+        return new Categorize(source, args.get(0), args.size() > 1 ? args.get(1) : null);
     }
 
     @Override

+ 1 - 1
x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java

@@ -269,7 +269,7 @@ public class FoldNullTests extends ESTestCase {
     }
 
     public void testNullCategorizeGroupingNotFolded() {
-        Categorize categorize = new Categorize(EMPTY, NULL);
+        // Passing NULL for the new options argument exercises the no-options path.
+        Categorize categorize = new Categorize(EMPTY, NULL, NULL);
         assertEquals(categorize, foldNull(categorize));
     }
 

+ 7 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/SerializableTokenListCategory.java

@@ -162,6 +162,13 @@ public class SerializableTokenListCategory implements Writeable {
         return Arrays.stream(keyTokenIndexes).mapToObj(index -> baseTokens[index]).toArray(BytesRef[]::new);
     }
 
+    /**
+     * Returns the category's key tokens decoded to UTF-8 and joined by single spaces.
+     * Presumably added to back the new CATEGORIZE "tokens" output format — confirm against callers.
+     */
+    public String getKeyTokensString() {
+        return Arrays.stream(keyTokenIndexes)
+            .mapToObj(index -> baseTokens[index])
+            .map(BytesRef::utf8ToString)
+            .collect(Collectors.joining(" "));
+    }
+
     public String getRegex() {
         if (keyTokenIndexes.length == 0 || orderedCommonTokenBeginIndex == orderedCommonTokenEndIndex) {
             return ".*";