Selaa lähdekoodia

Use standard tokenizer in ES|QL categorize (#129642)

* Use standard tokenizer in ES|QL categorize

* rename capability

* remove unused param

* [CI] Auto commit changes from spotless

---------

Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
Jan Kuipers 4 kuukautta sitten
vanhempi
commit
1bcd9b4170

+ 16 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java

@@ -210,6 +210,22 @@ public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeab
             .build();
     }
 
+    /**
+     * Creates the <code>categorization_analyzer</code> that is used by the ES|QL categorize function.
+     * <p>
+     * The analyzer is assembled from the <code>first_line_with_letters</code> char filter, the
+     * <code>standard</code> tokenizer, the date-words token filter and the limit filter.
+     * The only difference from the DSL analyzer is the tokenizer (<code>standard</code> instead of
+     * <code>ml_standard</code>).
+     * <p>
+     * As a consequence, the results are slightly different from those of the categorize text aggregation
+     * and the ML job. In exchange, the resulting tokens can be used for looking up messages in indices
+     * generated with the standard tokenizer; that ability is considered more important than producing
+     * identical categories.
+     *
+     * @return the analyzer configuration for ES|QL categorization
+     */
+    public static CategorizationAnalyzerConfig buildStandardEsqlCategorizationAnalyzer() {
+
+        return new CategorizationAnalyzerConfig.Builder().addCharFilter("first_line_with_letters")
+            .setTokenizer("standard")
+            .addDateWordsTokenFilter()
+            .addLimitFilter()
+            .build();
+    }
+
     private final String analyzer;
     private final List<NameOrDefinition> charFilters;
     private final NameOrDefinition tokenizer;

+ 2 - 4
x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java

@@ -39,7 +39,6 @@ import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
 
 import java.io.IOException;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 
@@ -48,9 +47,8 @@ import java.util.Objects;
  */
 public class CategorizeBlockHash extends BlockHash {
 
-    private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig.buildStandardCategorizationAnalyzer(
-        List.of()
-    );
+    private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig
+        .buildStandardEsqlCategorizationAnalyzer();
     private static final int NULL_ORD = 0;
 
     private final int channel;

+ 52 - 52
x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec

@@ -1,5 +1,5 @@
 standard aggs
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS count=COUNT(),
@@ -17,7 +17,7 @@ count:long | sum:long |     avg:double     | count_distinct:long | category:keyw
 ;
 
 values aggs
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS values=MV_SORT(VALUES(message)),
@@ -33,7 +33,7 @@ values:keyword                                                        |      top
 ;
 
 mv
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM mv_sample_data
   | STATS COUNT(), SUM(event_duration) BY category=CATEGORIZE(message)
@@ -48,7 +48,7 @@ COUNT():long | SUM(event_duration):long | category:keyword
 ;
 
 row mv
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 ROW message = ["connected to a", "connected to b", "disconnected"], str = ["a", "b", "c"]
   | STATS COUNT(), VALUES(str) BY category=CATEGORIZE(message)
@@ -61,7 +61,7 @@ COUNT():long | VALUES(str):keyword | category:keyword
 ;
 
 limit before stats
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data | SORT message | LIMIT 4
   | STATS count=COUNT() BY category=CATEGORIZE(message)
@@ -74,7 +74,7 @@ count:long | category:keyword
 ;
 
 skips stopwords
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 ROW message = ["Mon Tue connected to a", "Jul Aug connected to b September ", "UTC connected GMT to c UTC"]
   | STATS COUNT() BY category=CATEGORIZE(message)
@@ -86,7 +86,7 @@ COUNT():long | category:keyword
 ;
 
 with multiple indices
-required_capability: categorize_v5
+required_capability: categorize_v6
 required_capability: union_types
 
 FROM sample_data*
@@ -101,7 +101,7 @@ COUNT():long | category:keyword
 ;
 
 mv with many values
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM employees
   | STATS COUNT() BY category=CATEGORIZE(job_positions)
@@ -118,7 +118,7 @@ COUNT():long | category:keyword
 ;
 
 mv with many values and SUM
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM employees
   | STATS SUM(languages) BY category=CATEGORIZE(job_positions)
@@ -133,7 +133,7 @@ SUM(languages):long | category:keyword
 ;
 
 mv with many values and nulls and SUM
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM employees
   | STATS SUM(languages) BY category=CATEGORIZE(job_positions)
@@ -147,7 +147,7 @@ SUM(languages):long | category:keyword
 ;
 
 mv via eval
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | EVAL message = MV_APPEND(message, "Banana")
@@ -163,7 +163,7 @@ COUNT():long | category:keyword
 ;
 
 mv via eval const
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | EVAL message = ["Banana", "Bread"]
@@ -177,7 +177,7 @@ COUNT():long | category:keyword
 ;
 
 mv via eval const without aliases
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | EVAL message = ["Banana", "Bread"]
@@ -191,7 +191,7 @@ COUNT():long | CATEGORIZE(message):keyword
 ;
 
 mv const in parameter
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT() BY c = CATEGORIZE(["Banana", "Bread"])
@@ -204,7 +204,7 @@ COUNT():long | c:keyword
 ;
 
 agg alias shadowing
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS c = COUNT() BY c = CATEGORIZE(["Banana", "Bread"])
@@ -219,7 +219,7 @@ c:keyword
 ;
 
 chained aggregations using categorize
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT() BY category=CATEGORIZE(message)
@@ -234,7 +234,7 @@ COUNT():long | category:keyword
 ;
 
 stats without aggs
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS BY category=CATEGORIZE(message)
@@ -248,7 +248,7 @@ category:keyword
 ;
 
 text field
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM hosts
   | STATS COUNT() BY category=CATEGORIZE(host_group)
@@ -256,9 +256,9 @@ FROM hosts
 ;
 
 COUNT():long | category:keyword
+           2 | .*?DB.+?servers.*?
            2 | .*?Gateway.+?instances.*?
            5 | .*?Kubernetes.+?cluster.*?
-           2 | .*?servers.*?
            1 | null
 
 // Note: DB is removed from "DB servers", because the ml_standard
@@ -266,7 +266,7 @@ COUNT():long | category:keyword
 ;
 
 on TO_UPPER
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT() BY category=CATEGORIZE(TO_UPPER(message))
@@ -280,7 +280,7 @@ COUNT():long | category:keyword
 ;
 
 on CONCAT
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " banana"))
@@ -294,7 +294,7 @@ COUNT():long | category:keyword
 ;
 
 on CONCAT with unicode
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " 👍🏽😊"))
@@ -302,13 +302,13 @@ FROM sample_data
 ;
 
 COUNT():long | category:keyword
-           3 | .*?Connected.+?to.*?
-           3 | .*?Connection.+?error.*?
-           1 | .*?Disconnected.*?
+3            | .*?Connected.+?to.+?👍🏽.+?😊.*?
+3            | .*?Connection.+?error.+?👍🏽.+?😊.*?
+1            | .*?Disconnected.+?👍🏽.+?😊.*?
 ;
 
 on REVERSE(CONCAT())
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT() BY category=CATEGORIZE(REVERSE(CONCAT(message, " 👍🏽😊")))
@@ -316,13 +316,13 @@ FROM sample_data
 ;
 
 COUNT():long | category:keyword
-           1 | .*?detcennocsiD.*?
-           3 | .*?ot.+?detcennoC.*?
-           3 | .*?rorre.+?noitcennoC.*?
+1            | .*?😊.+?👍🏽.+?detcennocsiD.*?
+3            | .*?😊.+?👍🏽.+?ot.+?detcennoC.*?
+3            | .*?😊.+?👍🏽.+?rorre.+?noitcennoC.*?
 ;
 
 and then TO_LOWER
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT() BY category=CATEGORIZE(message)
@@ -337,7 +337,7 @@ COUNT():long | category:keyword
 ;
 
 on const empty string
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT() BY category=CATEGORIZE("")
@@ -349,7 +349,7 @@ COUNT():long | category:keyword
 ;
 
 on const empty string from eval
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | EVAL x = ""
@@ -362,7 +362,7 @@ COUNT():long | category:keyword
 ;
 
 on null
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | EVAL x = null
@@ -375,7 +375,7 @@ COUNT():long | SUM(event_duration):long | category:keyword
 ;
 
 on null string
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | EVAL x = null::string
@@ -388,7 +388,7 @@ COUNT():long | category:keyword
 ;
 
 on const null
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT(), SUM(event_duration) BY category=CATEGORIZE(null)
@@ -400,7 +400,7 @@ COUNT():long | SUM(event_duration):long | category:keyword
 ;
 
 on null row
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 ROW message = null, str = ["a", "b", "c"]
 | STATS COUNT(), VALUES(str) BY category=CATEGORIZE(message)
@@ -411,7 +411,7 @@ COUNT():long | VALUES(str):keyword | category:keyword
 ;
 
 filtering out all data
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | WHERE @timestamp < "2023-10-23T00:00:00Z"
@@ -423,7 +423,7 @@ COUNT():long | category:keyword
 ;
 
 filtering out all data with constant
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT() BY category=CATEGORIZE(message)
@@ -434,7 +434,7 @@ COUNT():long | category:keyword
 ;
 
 drop output columns
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS count=COUNT() BY category=CATEGORIZE(message)
@@ -449,7 +449,7 @@ x:integer
 ;
 
 category value processing
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 ROW message = ["connected to a", "connected to b", "disconnected"]
   | STATS COUNT() BY category=CATEGORIZE(message)
@@ -463,7 +463,7 @@ COUNT():long | category:keyword
 ;
 
 row aliases
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 ROW message = "connected to xyz"
   | EVAL x = message
@@ -477,7 +477,7 @@ COUNT():long | category:keyword           | y:keyword
 ;
 
 from aliases
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | EVAL x = message
@@ -493,7 +493,7 @@ COUNT():long | category:keyword         | y:keyword
 ;
 
 row aliases with keep
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 ROW message = "connected to xyz"
   | EVAL x = message
@@ -509,7 +509,7 @@ COUNT():long | y:keyword
 ;
 
 from aliases with keep
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | EVAL x = message
@@ -527,7 +527,7 @@ COUNT():long | y:keyword
 ;
 
 row rename
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 ROW message = "connected to xyz"
   | RENAME message as x
@@ -541,7 +541,7 @@ COUNT():long | y:keyword
 ;
 
 from rename
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | RENAME message as x
@@ -557,7 +557,7 @@ COUNT():long | y:keyword
 ;
 
 row drop
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 ROW message = "connected to a"
   | STATS c = COUNT() BY category=CATEGORIZE(message)
@@ -570,7 +570,7 @@ c:long
 ;
 
 from drop
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS c = COUNT() BY category=CATEGORIZE(message)
@@ -585,7 +585,7 @@ c:long
 ;
 
 reuse categorize arg expression in agg
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
 | STATS m = MAX(LENGTH(CONCAT(message, "_end"))) BY c = CATEGORIZE(CONCAT(message, "_end"))
@@ -600,7 +600,7 @@ m:integer      |c:keyword
 
 
 categorize in aggs inside function
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT(), x = MV_APPEND(category, category) BY category=CATEGORIZE(message)
@@ -615,7 +615,7 @@ COUNT():long | x:keyword
 ;
 
 categorize in aggs same as grouping inside function
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT(), x = MV_APPEND(CATEGORIZE(message), `CATEGORIZE(message)`) BY CATEGORIZE(message)
@@ -630,7 +630,7 @@ COUNT():long | x:keyword
 ;
 
 categorize in aggs same as grouping inside function with explicit alias
-required_capability: categorize_v5
+required_capability: categorize_v6
 
 FROM sample_data
   | STATS COUNT(), x = MV_APPEND(CATEGORIZE(message), category) BY category=CATEGORIZE(message)

+ 1 - 1
x-pack/plugin/esql/qa/testFixtures/src/main/resources/docs.csv-spec

@@ -690,7 +690,7 @@ Bangalore     | 9                 | 72
 ;
 
 docsCategorize
-required_capability: categorize_v5
+required_capability: categorize_v6
 // tag::docsCategorize[]
 FROM sample_data
 | STATS count=COUNT() BY category=CATEGORIZE(message)

+ 1 - 1
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

@@ -665,7 +665,7 @@ public class EsqlCapabilities {
         /**
          * Supported the text categorization function "CATEGORIZE".
          */
-        CATEGORIZE_V5,
+        CATEGORIZE_V6,
 
         /**
          * Support for multiple groupings in "CATEGORIZE".